{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5607, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 191.71875, "epoch": 0.0005350454788657035, "grad_norm": 0.6280093193054199, "kl": 0.0, "learning_rate": 8.9126559714795e-09, "loss": -0.0, "reward": 0.10365624725818634, "reward_std": 0.09572215378284454, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08803124725818634, "step": 1 }, { "completion_length": 184.15625, "epoch": 0.001070090957731407, "grad_norm": 3.079563856124878, "kl": 0.0, "learning_rate": 1.7825311942959e-08, "loss": 0.0, "reward": 0.07984374463558197, "reward_std": 0.19249790906906128, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04859375208616257, "step": 2 }, { "completion_length": 175.6875, "epoch": 0.0016051364365971107, "grad_norm": 1.9809600114822388, "kl": 0.000469846127089113, "learning_rate": 2.6737967914438503e-08, "loss": 0.0, "reward": 0.10203124582767487, "reward_std": 0.17895342409610748, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07078124582767487, "step": 3 }, { "completion_length": 184.9375, "epoch": 0.002140181915462814, "grad_norm": 1.0554888248443604, "kl": 0.0011950371554121375, "learning_rate": 3.5650623885918e-08, "loss": 0.0, "reward": 0.13637499511241913, "reward_std": 0.323363721370697, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04262499511241913, "step": 4 }, { "completion_length": 188.375, "epoch": 0.002675227394328518, "grad_norm": 1.1760318279266357, "kl": 0.0015934794209897518, "learning_rate": 4.456327985739751e-08, "loss": 0.0001, "reward": 0.062406253069639206, "reward_std": 0.12230199575424194, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06240624934434891, "step": 5 }, { "completion_length": 172.59375, "epoch": 0.0032102728731942215, "grad_norm": 0.9431440234184265, "kl": 0.002046941313892603, "learning_rate": 5.3475935828877005e-08, "loss": 0.0001, "reward": 0.5207499861717224, "reward_std": 0.7556440234184265, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1301250010728836, "step": 6 }, { "completion_length": 181.78125, "epoch": 0.003745318352059925, "grad_norm": 0.5845279693603516, "kl": 0.002756249625235796, "learning_rate": 6.238859180035651e-08, "loss": 0.0001, "reward": 0.10709375888109207, "reward_std": 0.1459238976240158, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07584375143051147, "step": 7 }, { "completion_length": 183.125, "epoch": 0.004280363830925628, "grad_norm": 0.6257660388946533, "kl": 0.0017757145687937737, "learning_rate": 7.1301247771836e-08, "loss": 0.0001, "reward": 0.1185312420129776, "reward_std": 0.10124413669109344, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1185312420129776, "step": 8 }, { "completion_length": 190.4375, "epoch": 0.004815409309791332, "grad_norm": 0.6991696953773499, "kl": 0.005310946609824896, "learning_rate": 8.021390374331552e-08, "loss": 0.0002, "reward": 0.11093749850988388, "reward_std": 0.08968326449394226, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09531249850988388, "step": 9 }, { "completion_length": 172.125, "epoch": 0.005350454788657036, "grad_norm": 0.6007075905799866, "kl": 0.0014581300783902407, "learning_rate": 8.912655971479501e-08, "loss": 0.0001, "reward": 0.13378125429153442, "reward_std": 0.1750607192516327, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10253125429153442, "step": 10 }, { "completion_length": 183.96875, "epoch": 0.005885500267522739, "grad_norm": 1.5717180967330933, "kl": 0.011658621951937675, "learning_rate": 9.803921568627452e-08, "loss": 0.0005, "reward": 0.14615625143051147, "reward_std": 0.3221757709980011, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021156249567866325, "step": 11 }, { "completion_length": 174.0, "epoch": 0.006420545746388443, "grad_norm": 0.9787147641181946, "kl": 0.0017083825077861547, "learning_rate": 1.0695187165775401e-07, "loss": 0.0001, "reward": 0.08474999666213989, "reward_std": 0.14751693606376648, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06912499666213989, "step": 12 }, { "completion_length": 171.25, "epoch": 0.006955591225254147, "grad_norm": 2.602705240249634, "kl": 0.003725790185853839, "learning_rate": 1.1586452762923353e-07, "loss": 0.0001, "reward": 0.3565312623977661, "reward_std": 0.6755238175392151, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09090624749660492, "step": 13 }, { "completion_length": 185.25, "epoch": 0.00749063670411985, "grad_norm": 38101917696.0, "kl": 16402053.0, "learning_rate": 1.2477718360071302e-07, "loss": 656082.125, "reward": 0.15703123807907104, "reward_std": 0.29019200801849365, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06328124552965164, "step": 14 }, { "completion_length": 170.90625, "epoch": 0.008025682182985553, "grad_norm": 183.7782440185547, "kl": 38.70970153808594, "learning_rate": 1.3368983957219251e-07, "loss": 1.5484, "reward": 0.3413437604904175, "reward_std": 0.4988768696784973, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12259374558925629, "step": 15 }, { "completion_length": 184.90625, "epoch": 0.008560727661851257, "grad_norm": 1.299963116645813, "kl": 0.002406194806098938, "learning_rate": 1.42602495543672e-07, "loss": 0.0001, "reward": 0.06987500190734863, "reward_std": 0.1164725124835968, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06987500190734863, "step": 16 }, { "completion_length": 188.9375, "epoch": 0.00909577314071696, "grad_norm": 6052494.0, "kl": 33832.546875, "learning_rate": 1.5151515151515152e-07, "loss": 1353.3015, "reward": 0.027812495827674866, "reward_std": 0.12716242671012878, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.027812499552965164, "step": 17 }, { "completion_length": 179.875, "epoch": 0.009630818619582664, "grad_norm": 1.9883971214294434, "kl": 0.002975866897031665, "learning_rate": 1.6042780748663104e-07, "loss": 0.0001, "reward": 0.17018748819828033, "reward_std": 0.23126643896102905, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10768750309944153, "step": 18 }, { "completion_length": 185.5, "epoch": 0.010165864098448368, "grad_norm": 1.8181746006011963, "kl": 0.0044852192513644695, "learning_rate": 1.6934046345811053e-07, "loss": 0.0002, "reward": 0.1548749953508377, "reward_std": 0.11923106014728546, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12362499535083771, "step": 19 }, { "completion_length": 165.1875, "epoch": 0.010700909577314071, "grad_norm": 1.1429587602615356, "kl": 0.007865410298109055, "learning_rate": 1.7825311942959003e-07, "loss": 0.0003, "reward": 0.11534374952316284, "reward_std": 0.10804031789302826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11534375697374344, "step": 20 }, { "completion_length": 176.90625, "epoch": 0.011235955056179775, "grad_norm": 2.0425732135772705, "kl": 0.00233033811673522, "learning_rate": 1.8716577540106952e-07, "loss": 0.0001, "reward": 0.2839062511920929, "reward_std": 0.35050711035728455, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1276562511920929, "step": 21 }, { "completion_length": 179.5625, "epoch": 0.011771000535045479, "grad_norm": 0.6571725010871887, "kl": 0.0024692818988114595, "learning_rate": 1.9607843137254904e-07, "loss": 0.0001, "reward": 0.26237499713897705, "reward_std": 0.3545153737068176, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10612499713897705, "step": 22 }, { "completion_length": 170.5, "epoch": 0.012306046013911182, "grad_norm": 0.9374416470527649, "kl": 0.012939373031258583, "learning_rate": 2.0499108734402856e-07, "loss": 0.0005, "reward": 0.2993437647819519, "reward_std": 0.5451701879501343, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08059374988079071, "step": 23 }, { "completion_length": 182.75, "epoch": 0.012841091492776886, "grad_norm": 0.6387389898300171, "kl": 0.002166611375287175, "learning_rate": 2.1390374331550802e-07, "loss": 0.0001, "reward": 0.32865625619888306, "reward_std": 0.41992613673210144, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07865625619888306, "step": 24 }, { "completion_length": 185.28125, "epoch": 0.01337613697164259, "grad_norm": 3.8845043182373047, "kl": 0.03007567673921585, "learning_rate": 2.2281639928698754e-07, "loss": 0.0012, "reward": 0.19203124940395355, "reward_std": 0.34487831592559814, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08265624940395355, "step": 25 }, { "completion_length": 185.03125, "epoch": 0.013911182450508293, "grad_norm": 0.7048884034156799, "kl": 0.0026221787557005882, "learning_rate": 2.3172905525846706e-07, "loss": 0.0001, "reward": 0.1913125067949295, "reward_std": 0.31482741236686707, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1131875067949295, "step": 26 }, { "completion_length": 189.6875, "epoch": 0.014446227929373997, "grad_norm": 1.3865783214569092, "kl": 0.004993971437215805, "learning_rate": 2.4064171122994655e-07, "loss": 0.0002, "reward": 0.18649998307228088, "reward_std": 0.33604902029037476, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09274999797344208, "step": 27 }, { "completion_length": 187.03125, "epoch": 0.0149812734082397, "grad_norm": 2.0572826862335205, "kl": 0.002402370097115636, "learning_rate": 2.4955436720142604e-07, "loss": 0.0001, "reward": 0.08759374916553497, "reward_std": 0.19255495071411133, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.056343741714954376, "step": 28 }, { "completion_length": 187.6875, "epoch": 0.015516318887105404, "grad_norm": 23.313865661621094, "kl": 0.04833727702498436, "learning_rate": 2.584670231729056e-07, "loss": 0.0019, "reward": 0.1198437511920929, "reward_std": 0.17905518412590027, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0729687511920929, "step": 29 }, { "completion_length": 195.8125, "epoch": 0.016051364365971106, "grad_norm": 2.121685028076172, "kl": 0.0021614041179418564, "learning_rate": 2.6737967914438503e-07, "loss": 0.0001, "reward": 0.15662500262260437, "reward_std": 0.30396193265914917, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04725000262260437, "step": 30 }, { "completion_length": 177.59375, "epoch": 0.01658640984483681, "grad_norm": 0.9327354431152344, "kl": 0.0019120838260278106, "learning_rate": 2.7629233511586457e-07, "loss": 0.0001, "reward": 0.1314687430858612, "reward_std": 0.15068717300891876, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1002187505364418, "step": 31 }, { "completion_length": 179.53125, "epoch": 0.017121455323702513, "grad_norm": 0.8987910747528076, "kl": 0.0025291701313108206, "learning_rate": 2.85204991087344e-07, "loss": 0.0001, "reward": 0.18443751335144043, "reward_std": 0.16988538205623627, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13756249845027924, "step": 32 }, { "completion_length": 185.71875, "epoch": 0.01765650080256822, "grad_norm": 1.7469781637191772, "kl": 0.0028533004224300385, "learning_rate": 2.9411764705882356e-07, "loss": 0.0001, "reward": 0.09262499958276749, "reward_std": 0.14116664230823517, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07699999958276749, "step": 33 }, { "completion_length": 190.0625, "epoch": 0.01819154628143392, "grad_norm": 2.2309024333953857, "kl": 0.0023159862030297518, "learning_rate": 3.0303030303030305e-07, "loss": 0.0001, "reward": 0.20990625023841858, "reward_std": 0.3060622811317444, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11615625023841858, "step": 34 }, { "completion_length": 190.84375, "epoch": 0.018726591760299626, "grad_norm": 1.039546251296997, "kl": 0.002171012805774808, "learning_rate": 3.1194295900178254e-07, "loss": 0.0001, "reward": 0.08215625584125519, "reward_std": 0.13188451528549194, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08215624839067459, "step": 35 }, { "completion_length": 171.59375, "epoch": 0.019261637239165328, "grad_norm": 44.43064880371094, "kl": 0.14972423017024994, "learning_rate": 3.208556149732621e-07, "loss": 0.006, "reward": 0.17737498879432678, "reward_std": 0.2886289060115814, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09925000369548798, "step": 36 }, { "completion_length": 177.375, "epoch": 0.019796682718031033, "grad_norm": 1.4435793161392212, "kl": 0.0022582095116376877, "learning_rate": 3.297682709447415e-07, "loss": 0.0001, "reward": 0.49168747663497925, "reward_std": 0.6530897617340088, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06981250643730164, "step": 37 }, { "completion_length": 184.5625, "epoch": 0.020331728196896735, "grad_norm": 0.8257693648338318, "kl": 0.0033723984379321337, "learning_rate": 3.3868092691622107e-07, "loss": 0.0001, "reward": 0.17868748307228088, "reward_std": 0.32020050287246704, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08493749797344208, "step": 38 }, { "completion_length": 175.65625, "epoch": 0.02086677367576244, "grad_norm": 0.8414753079414368, "kl": 0.002081364393234253, "learning_rate": 3.4759358288770056e-07, "loss": 0.0001, "reward": 0.10662499070167542, "reward_std": 0.1434936672449112, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09099999815225601, "step": 39 }, { "completion_length": 168.1875, "epoch": 0.021401819154628143, "grad_norm": 1.111526608467102, "kl": 0.0033324304968118668, "learning_rate": 3.5650623885918005e-07, "loss": 0.0001, "reward": 0.2227187603712082, "reward_std": 0.3658488392829895, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0977187529206276, "step": 40 }, { "completion_length": 170.53125, "epoch": 0.021936864633493848, "grad_norm": 2.0217812061309814, "kl": 0.03068654052913189, "learning_rate": 3.654188948306596e-07, "loss": 0.0012, "reward": 0.11081250011920929, "reward_std": 0.09971874952316284, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11081250011920929, "step": 41 }, { "completion_length": 179.875, "epoch": 0.02247191011235955, "grad_norm": 1.978493332862854, "kl": 0.0036891165655106306, "learning_rate": 3.7433155080213904e-07, "loss": 0.0001, "reward": 0.1586562544107437, "reward_std": 0.12468364834785461, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11178124696016312, "step": 42 }, { "completion_length": 183.1875, "epoch": 0.023006955591225255, "grad_norm": 0.9559652209281921, "kl": 0.0017528327880427241, "learning_rate": 3.8324420677361853e-07, "loss": 0.0001, "reward": 0.17859375476837158, "reward_std": 0.3393903076648712, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08484373986721039, "step": 43 }, { "completion_length": 169.0, "epoch": 0.023542001070090957, "grad_norm": 0.7829432487487793, "kl": 0.0022417190484702587, "learning_rate": 3.921568627450981e-07, "loss": 0.0001, "reward": 0.2914687395095825, "reward_std": 0.5298420190811157, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10396875441074371, "step": 44 }, { "completion_length": 174.6875, "epoch": 0.024077046548956663, "grad_norm": 0.9287885427474976, "kl": 0.0015788230812177062, "learning_rate": 4.0106951871657757e-07, "loss": 0.0001, "reward": 0.21799999475479126, "reward_std": 0.19750699400901794, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12424999475479126, "step": 45 }, { "completion_length": 169.875, "epoch": 0.024612092027822365, "grad_norm": 2.5593068599700928, "kl": 0.002411093097180128, "learning_rate": 4.099821746880571e-07, "loss": 0.0001, "reward": 0.11650000512599945, "reward_std": 0.15231230854988098, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10087499022483826, "step": 46 }, { "completion_length": 170.59375, "epoch": 0.02514713750668807, "grad_norm": 0.7311099171638489, "kl": 0.003140038810670376, "learning_rate": 4.188948306595366e-07, "loss": 0.0001, "reward": 0.27909374237060547, "reward_std": 0.5962870121002197, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.044718753546476364, "step": 47 }, { "completion_length": 175.28125, "epoch": 0.025682182985553772, "grad_norm": 0.5579654574394226, "kl": 0.0026314426213502884, "learning_rate": 4.2780748663101604e-07, "loss": 0.0001, "reward": 0.10309375077486038, "reward_std": 0.13821087777614594, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08746875822544098, "step": 48 }, { "completion_length": 177.6875, "epoch": 0.026217228464419477, "grad_norm": 2.170424699783325, "kl": 0.006701488047838211, "learning_rate": 4.367201426024956e-07, "loss": 0.0003, "reward": 0.08528125286102295, "reward_std": 0.09808943420648575, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08528125286102295, "step": 49 }, { "completion_length": 177.34375, "epoch": 0.02675227394328518, "grad_norm": 2.375812292098999, "kl": 0.005870660301297903, "learning_rate": 4.456327985739751e-07, "loss": 0.0002, "reward": 0.17537499964237213, "reward_std": 0.2745421528816223, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08162499964237213, "step": 50 }, { "completion_length": 186.0625, "epoch": 0.027287319422150885, "grad_norm": 0.9743713736534119, "kl": 0.003214301774278283, "learning_rate": 4.5454545454545457e-07, "loss": 0.0001, "reward": 0.07568749785423279, "reward_std": 0.15331195294857025, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04443750157952309, "step": 51 }, { "completion_length": 177.15625, "epoch": 0.027822364901016586, "grad_norm": 687070848.0, "kl": 638828.6875, "learning_rate": 4.634581105169341e-07, "loss": 25553.1523, "reward": 0.11137500405311584, "reward_std": 0.1010788232088089, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11137500405311584, "step": 52 }, { "completion_length": 171.375, "epoch": 0.02835741037988229, "grad_norm": 1.2935677766799927, "kl": 0.003658934496343136, "learning_rate": 4.7237076648841356e-07, "loss": 0.0001, "reward": 0.44178125262260437, "reward_std": 0.8123147487640381, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09803125262260437, "step": 53 }, { "completion_length": 185.21875, "epoch": 0.028892455858747994, "grad_norm": 0.7025152444839478, "kl": 0.0024513474199920893, "learning_rate": 4.812834224598931e-07, "loss": 0.0001, "reward": 0.2238750010728836, "reward_std": 0.35869476199150085, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0988750010728836, "step": 54 }, { "completion_length": 179.90625, "epoch": 0.029427501337613696, "grad_norm": 0.4499192535877228, "kl": 0.002092120237648487, "learning_rate": 4.901960784313725e-07, "loss": 0.0001, "reward": 0.22628125548362732, "reward_std": 0.3754405975341797, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07003124803304672, "step": 55 }, { "completion_length": 175.625, "epoch": 0.0299625468164794, "grad_norm": 2.1538944244384766, "kl": 0.0030568530783057213, "learning_rate": 4.991087344028521e-07, "loss": 0.0001, "reward": 0.09274999797344208, "reward_std": 0.1241927370429039, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09274999797344208, "step": 56 }, { "completion_length": 159.03125, "epoch": 0.030497592295345103, "grad_norm": 4.143250942230225, "kl": 0.04276318475604057, "learning_rate": 5.080213903743316e-07, "loss": 0.0017, "reward": 0.27143749594688416, "reward_std": 0.41626161336898804, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09956249594688416, "step": 57 }, { "completion_length": 187.90625, "epoch": 0.03103263777421081, "grad_norm": 20.024755477905273, "kl": 0.0944850891828537, "learning_rate": 5.169340463458112e-07, "loss": 0.0038, "reward": 0.0871874988079071, "reward_std": 0.059674378484487534, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0871874988079071, "step": 58 }, { "completion_length": 180.25, "epoch": 0.03156768325307651, "grad_norm": 1.6493560075759888, "kl": 0.007852243259549141, "learning_rate": 5.258467023172906e-07, "loss": 0.0003, "reward": 0.12718750536441803, "reward_std": 0.15477940440177917, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09593749046325684, "step": 59 }, { "completion_length": 188.09375, "epoch": 0.03210272873194221, "grad_norm": 1.67122483253479, "kl": 0.0018673643935471773, "learning_rate": 5.347593582887701e-07, "loss": 0.0001, "reward": 0.269406259059906, "reward_std": 0.39273780584335327, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09753125160932541, "step": 60 }, { "completion_length": 182.21875, "epoch": 0.03263777421080792, "grad_norm": 0.9040886759757996, "kl": 0.006689334288239479, "learning_rate": 5.436720142602496e-07, "loss": 0.0003, "reward": 0.15424999594688416, "reward_std": 0.5064101219177246, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.002000001259148121, "step": 61 }, { "completion_length": 173.8125, "epoch": 0.03317281968967362, "grad_norm": 0.9161165356636047, "kl": 0.007428353652358055, "learning_rate": 5.525846702317291e-07, "loss": 0.0003, "reward": 0.26350000500679016, "reward_std": 0.38183465600013733, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07599999755620956, "step": 62 }, { "completion_length": 179.21875, "epoch": 0.033707865168539325, "grad_norm": 1.2794345617294312, "kl": 0.003915486857295036, "learning_rate": 5.614973262032086e-07, "loss": 0.0002, "reward": 0.24918749928474426, "reward_std": 0.402986079454422, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07731249928474426, "step": 63 }, { "completion_length": 190.78125, "epoch": 0.03424291064740503, "grad_norm": 0.5601391792297363, "kl": 0.002833736827597022, "learning_rate": 5.70409982174688e-07, "loss": 0.0001, "reward": 0.15662500262260437, "reward_std": 0.13110655546188354, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09412500262260437, "step": 64 }, { "completion_length": 192.21875, "epoch": 0.034777956126270736, "grad_norm": 1.1248598098754883, "kl": 0.001826292253099382, "learning_rate": 5.793226381461676e-07, "loss": 0.0001, "reward": 0.11743749678134918, "reward_std": 0.1925003081560135, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.07056249678134918, "step": 65 }, { "completion_length": 163.875, "epoch": 0.03531300160513644, "grad_norm": 1.1078684329986572, "kl": 0.002804708434268832, "learning_rate": 5.882352941176471e-07, "loss": 0.0001, "reward": 0.3662499785423279, "reward_std": 0.4226645231246948, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.11625000089406967, "step": 66 }, { "completion_length": 182.34375, "epoch": 0.03584804708400214, "grad_norm": 1.4848650693893433, "kl": 0.0026009869761765003, "learning_rate": 5.971479500891267e-07, "loss": 0.0001, "reward": 0.08015625178813934, "reward_std": 0.08236134052276611, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08015624433755875, "step": 67 }, { "completion_length": 174.71875, "epoch": 0.03638309256286784, "grad_norm": 2.3324737548828125, "kl": 0.012466944754123688, "learning_rate": 6.060606060606061e-07, "loss": 0.0005, "reward": 0.1820937544107437, "reward_std": 0.3267054557800293, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.057093750685453415, "step": 68 }, { "completion_length": 176.4375, "epoch": 0.03691813804173355, "grad_norm": 0.9895151257514954, "kl": 0.001741687417961657, "learning_rate": 6.149732620320856e-07, "loss": 0.0001, "reward": 0.1887812614440918, "reward_std": 0.29811498522758484, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1106562465429306, "step": 69 }, { "completion_length": 177.15625, "epoch": 0.03745318352059925, "grad_norm": 0.8183426856994629, "kl": 0.005563116632401943, "learning_rate": 6.238859180035651e-07, "loss": 0.0002, "reward": 0.2529687285423279, "reward_std": 0.5214483141899109, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09671874344348907, "step": 70 }, { "completion_length": 177.125, "epoch": 0.037988228999464954, "grad_norm": 1.243821144104004, "kl": 0.002519675064831972, "learning_rate": 6.327985739750446e-07, "loss": 0.0001, "reward": 0.2603437304496765, "reward_std": 0.4845064878463745, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10409373790025711, "step": 71 }, { "completion_length": 164.84375, "epoch": 0.038523274478330656, "grad_norm": 2.235621929168701, "kl": 0.0037916633300483227, "learning_rate": 6.417112299465242e-07, "loss": 0.0002, "reward": 0.13118749856948853, "reward_std": 0.16712138056755066, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08431249856948853, "step": 72 }, { "completion_length": 171.4375, "epoch": 0.039058319957196365, "grad_norm": 1127758.375, "kl": 1981.2760009765625, "learning_rate": 6.506238859180036e-07, "loss": 79.251, "reward": 0.11303124576807022, "reward_std": 0.10777334868907928, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11303124576807022, "step": 73 }, { "completion_length": 177.84375, "epoch": 0.03959336543606207, "grad_norm": 0.8024388551712036, "kl": 0.0027657824102789164, "learning_rate": 6.59536541889483e-07, "loss": 0.0001, "reward": 0.1769999861717224, "reward_std": 0.20843049883842468, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1145000010728836, "step": 74 }, { "completion_length": 180.625, "epoch": 0.04012841091492777, "grad_norm": 0.8139789700508118, "kl": 0.0028207963332533836, "learning_rate": 6.684491978609627e-07, "loss": 0.0001, "reward": 0.15462499856948853, "reward_std": 0.34673988819122314, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.060874998569488525, "step": 75 }, { "completion_length": 193.75, "epoch": 0.04066345639379347, "grad_norm": 15.742280006408691, "kl": 0.02040281891822815, "learning_rate": 6.773618538324421e-07, "loss": 0.0008, "reward": 0.20084375143051147, "reward_std": 0.30213242769241333, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09146874397993088, "step": 76 }, { "completion_length": 187.125, "epoch": 0.04119850187265917, "grad_norm": 171242304.0, "kl": 13465841.0, "learning_rate": 6.862745098039217e-07, "loss": 538633.625, "reward": 0.06928125023841858, "reward_std": 0.11287619918584824, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06928125023841858, "step": 77 }, { "completion_length": 187.59375, "epoch": 0.04173354735152488, "grad_norm": 1.073614478111267, "kl": 0.004617815837264061, "learning_rate": 6.951871657754011e-07, "loss": 0.0002, "reward": 0.05753124877810478, "reward_std": 0.17501652240753174, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04190625250339508, "step": 78 }, { "completion_length": 165.6875, "epoch": 0.04226859283039058, "grad_norm": 2.2583389282226562, "kl": 0.005627387203276157, "learning_rate": 7.040998217468806e-07, "loss": 0.0002, "reward": 0.12209375202655792, "reward_std": 0.07349376380443573, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12209375202655792, "step": 79 }, { "completion_length": 178.90625, "epoch": 0.042803638309256285, "grad_norm": 1.1999577283859253, "kl": 0.0025825011543929577, "learning_rate": 7.130124777183601e-07, "loss": 0.0001, "reward": 0.09443749487400055, "reward_std": 0.2040117233991623, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06318750232458115, "step": 80 }, { "completion_length": 163.75, "epoch": 0.04333868378812199, "grad_norm": 2.6533455848693848, "kl": 0.0024711950682103634, "learning_rate": 7.219251336898397e-07, "loss": 0.0001, "reward": 0.20518749952316284, "reward_std": 0.3593289256095886, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11143749952316284, "step": 81 }, { "completion_length": 181.8125, "epoch": 0.043873729266987696, "grad_norm": 2.1904473304748535, "kl": 0.002009930554777384, "learning_rate": 7.308377896613192e-07, "loss": 0.0001, "reward": 0.152281254529953, "reward_std": 0.39298689365386963, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.042906254529953, "step": 82 }, { "completion_length": 192.0, "epoch": 0.0444087747458534, "grad_norm": 48690.41015625, "kl": 15.175191879272461, "learning_rate": 7.397504456327986e-07, "loss": 0.607, "reward": 0.10896874964237213, "reward_std": 0.13817524909973145, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07771874964237213, "step": 83 }, { "completion_length": 195.53125, "epoch": 0.0449438202247191, "grad_norm": 0.6525951027870178, "kl": 0.002320118248462677, "learning_rate": 7.486631016042781e-07, "loss": 0.0001, "reward": 0.08837500214576721, "reward_std": 0.1123420000076294, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07275000214576721, "step": 84 }, { "completion_length": 177.0, "epoch": 0.0454788657035848, "grad_norm": 0.8931333422660828, "kl": 0.002041170373558998, "learning_rate": 7.575757575757576e-07, "loss": 0.0001, "reward": 0.22824999690055847, "reward_std": 0.5326303839683533, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07199999690055847, "step": 85 }, { "completion_length": 163.9375, "epoch": 0.04601391118245051, "grad_norm": 3.0162289142608643, "kl": 0.002566506154835224, "learning_rate": 7.664884135472371e-07, "loss": 0.0001, "reward": 0.0769062489271164, "reward_std": 0.111920066177845, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0769062489271164, "step": 86 }, { "completion_length": 185.46875, "epoch": 0.04654895666131621, "grad_norm": 0.8581781387329102, "kl": 0.002193812280893326, "learning_rate": 7.754010695187167e-07, "loss": 0.0001, "reward": 0.18415625393390656, "reward_std": 0.2972266376018524, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10603125393390656, "step": 87 }, { "completion_length": 186.53125, "epoch": 0.047084002140181914, "grad_norm": 2.3058059215545654, "kl": 0.004540222696959972, "learning_rate": 7.843137254901962e-07, "loss": 0.0002, "reward": 0.0989999994635582, "reward_std": 0.08513141423463821, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0989999920129776, "step": 88 }, { "completion_length": 189.34375, "epoch": 0.047619047619047616, "grad_norm": 0.9769949316978455, "kl": 0.0023076902143657207, "learning_rate": 7.932263814616756e-07, "loss": 0.0001, "reward": 0.04731249809265137, "reward_std": 0.1221042200922966, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04731249809265137, "step": 89 }, { "completion_length": 185.1875, "epoch": 0.048154093097913325, "grad_norm": 1.5098062753677368, "kl": 0.0024107606150209904, "learning_rate": 8.021390374331551e-07, "loss": 0.0001, "reward": 0.24031248688697815, "reward_std": 0.40069761872291565, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06843750178813934, "step": 90 }, { "completion_length": 172.78125, "epoch": 0.04868913857677903, "grad_norm": 59.30844497680664, "kl": 0.3879415690898895, "learning_rate": 8.110516934046346e-07, "loss": 0.0155, "reward": 0.11496874690055847, "reward_std": 0.11583638936281204, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09934374690055847, "step": 91 }, { "completion_length": 185.0625, "epoch": 0.04922418405564473, "grad_norm": 1.3063592910766602, "kl": 0.004073680378496647, "learning_rate": 8.199643493761142e-07, "loss": 0.0002, "reward": 0.08890624344348907, "reward_std": 0.07560936361551285, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08890624344348907, "step": 92 }, { "completion_length": 185.4375, "epoch": 0.04975922953451043, "grad_norm": 1.5774999856948853, "kl": 0.003611853579059243, "learning_rate": 8.288770053475937e-07, "loss": 0.0001, "reward": 0.45784375071525574, "reward_std": 0.7271761894226074, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09846875071525574, "step": 93 }, { "completion_length": 187.46875, "epoch": 0.05029427501337614, "grad_norm": 1.909483551979065, "kl": 0.004188550636172295, "learning_rate": 8.377896613190732e-07, "loss": 0.0002, "reward": 0.10321874916553497, "reward_std": 0.11523078382015228, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08759375661611557, "step": 94 }, { "completion_length": 169.125, "epoch": 0.05082932049224184, "grad_norm": 0.932265043258667, "kl": 0.0028632329776883125, "learning_rate": 8.467023172905526e-07, "loss": 0.0001, "reward": 0.42934373021125793, "reward_std": 0.4731653332710266, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11684375256299973, "step": 95 }, { "completion_length": 182.78125, "epoch": 0.051364365971107544, "grad_norm": 1.4439777135849, "kl": 0.0021854452788829803, "learning_rate": 8.556149732620321e-07, "loss": 0.0001, "reward": 0.16587500274181366, "reward_std": 0.38108396530151367, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.07212499529123306, "step": 96 }, { "completion_length": 171.71875, "epoch": 0.051899411449973246, "grad_norm": 2.2285075187683105, "kl": 0.00274151423946023, "learning_rate": 8.645276292335117e-07, "loss": 0.0001, "reward": 0.29862502217292786, "reward_std": 0.3761540651321411, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12674999237060547, "step": 97 }, { "completion_length": 169.125, "epoch": 0.052434456928838954, "grad_norm": 0.7305043935775757, "kl": 0.004896554630249739, "learning_rate": 8.734402852049912e-07, "loss": 0.0002, "reward": 0.40059375762939453, "reward_std": 0.5604589581489563, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11934375762939453, "step": 98 }, { "completion_length": 184.28125, "epoch": 0.052969502407704656, "grad_norm": 1.1535252332687378, "kl": 0.0022688712924718857, "learning_rate": 8.823529411764707e-07, "loss": 0.0001, "reward": 0.11696875095367432, "reward_std": 0.09862823784351349, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11696875095367432, "step": 99 }, { "completion_length": 189.4375, "epoch": 0.05350454788657036, "grad_norm": 1.0599831342697144, "kl": 0.004714666865766048, "learning_rate": 8.912655971479502e-07, "loss": 0.0002, "reward": 0.1731249988079071, "reward_std": 0.3259737491607666, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0793749988079071, "step": 100 }, { "completion_length": 181.15625, "epoch": 0.05403959336543606, "grad_norm": 2.229004144668579, "kl": 0.003555197501555085, "learning_rate": 9.001782531194296e-07, "loss": 0.0001, "reward": 0.14112499356269836, "reward_std": 0.16846391558647156, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09425000101327896, "step": 101 }, { "completion_length": 182.84375, "epoch": 0.05457463884430177, "grad_norm": 0.9302679896354675, "kl": 0.0018388006137683988, "learning_rate": 9.090909090909091e-07, "loss": 0.0001, "reward": 0.2813437283039093, "reward_std": 0.4475325345993042, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04696875065565109, "step": 102 }, { "completion_length": 181.34375, "epoch": 0.05510968432316747, "grad_norm": 2.2122204303741455, "kl": 0.009882904589176178, "learning_rate": 9.180035650623886e-07, "loss": 0.0004, "reward": 0.17468750476837158, "reward_std": 0.3328070342540741, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08093750476837158, "step": 103 }, { "completion_length": 160.25, "epoch": 0.05564472980203317, "grad_norm": 0.8461737036705017, "kl": 0.0018024707678705454, "learning_rate": 9.269162210338682e-07, "loss": 0.0001, "reward": 0.4247500002384186, "reward_std": 0.5938481092453003, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11225000023841858, "step": 104 }, { "completion_length": 185.21875, "epoch": 0.056179775280898875, "grad_norm": 2.4756085872650146, "kl": 0.0026238500140607357, "learning_rate": 9.358288770053477e-07, "loss": 0.0001, "reward": 0.4346562623977661, "reward_std": 0.6421124339103699, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09090624749660492, "step": 105 }, { "completion_length": 179.75, "epoch": 0.05671482075976458, "grad_norm": 1.0667450428009033, "kl": 0.003565003164112568, "learning_rate": 9.447415329768271e-07, "loss": 0.0001, "reward": 0.26618751883506775, "reward_std": 0.3239186406135559, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10993750393390656, "step": 106 }, { "completion_length": 182.40625, "epoch": 0.057249866238630286, "grad_norm": 1.8783053159713745, "kl": 0.002864703070372343, "learning_rate": 9.536541889483067e-07, "loss": 0.0001, "reward": 0.3707812428474426, "reward_std": 0.36646801233291626, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10515625029802322, "step": 107 }, { "completion_length": 180.90625, "epoch": 0.05778491171749599, "grad_norm": 1.5894142389297485, "kl": 0.0025094833690673113, "learning_rate": 9.625668449197862e-07, "loss": 0.0001, "reward": 0.10884374380111694, "reward_std": 0.0855424553155899, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10884375870227814, "step": 108 }, { "completion_length": 189.59375, "epoch": 0.05831995719636169, "grad_norm": 0.6841498017311096, "kl": 0.002183490199968219, "learning_rate": 9.714795008912657e-07, "loss": 0.0001, "reward": 0.10859374701976776, "reward_std": 0.14742827415466309, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09296874701976776, "step": 109 }, { "completion_length": 178.3125, "epoch": 0.05885500267522739, "grad_norm": 1.1865333318710327, "kl": 0.0030986256897449493, "learning_rate": 9.80392156862745e-07, "loss": 0.0001, "reward": 0.04546874761581421, "reward_std": 0.11854679882526398, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04546874761581421, "step": 110 }, { "completion_length": 180.21875, "epoch": 0.0593900481540931, "grad_norm": 1.2976666688919067, "kl": 0.03093932755291462, "learning_rate": 9.893048128342248e-07, "loss": 0.0012, "reward": 0.0871562510728836, "reward_std": 0.12151187658309937, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0871562510728836, "step": 111 }, { "completion_length": 177.53125, "epoch": 0.0599250936329588, "grad_norm": 5.121525764465332, "kl": 0.026836980134248734, "learning_rate": 9.982174688057042e-07, "loss": 0.0011, "reward": 0.05434374883770943, "reward_std": 0.134809672832489, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05434374883770943, "step": 112 }, { "completion_length": 179.25, "epoch": 0.060460139111824504, "grad_norm": 1.2791961431503296, "kl": 0.006463521625846624, "learning_rate": 1.0071301247771837e-06, "loss": 0.0003, "reward": 0.3519687056541443, "reward_std": 0.44792306423187256, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10196875035762787, "step": 113 }, { "completion_length": 193.21875, "epoch": 0.060995184590690206, "grad_norm": 2.9094064235687256, "kl": 0.0043749636970460415, "learning_rate": 1.0160427807486633e-06, "loss": 0.0002, "reward": 0.16496872901916504, "reward_std": 0.30048638582229614, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07121874392032623, "step": 114 }, { "completion_length": 179.0, "epoch": 0.061530230069555915, "grad_norm": 0.851912796497345, "kl": 0.003761256579309702, "learning_rate": 1.0249554367201426e-06, "loss": 0.0002, "reward": 0.30287501215934753, "reward_std": 0.3570297956466675, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09974999725818634, "step": 115 }, { "completion_length": 191.46875, "epoch": 0.06206527554842162, "grad_norm": 12.566967010498047, "kl": 0.20301944017410278, "learning_rate": 1.0338680926916224e-06, "loss": 0.0081, "reward": 0.11012500524520874, "reward_std": 0.11535286158323288, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07887499779462814, "step": 116 }, { "completion_length": 184.28125, "epoch": 0.06260032102728733, "grad_norm": 1.7212048768997192, "kl": 0.0034116790629923344, "learning_rate": 1.0427807486631017e-06, "loss": 0.0001, "reward": 0.31978124380111694, "reward_std": 0.5576620101928711, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05415625125169754, "step": 117 }, { "completion_length": 171.9375, "epoch": 0.06313536650615302, "grad_norm": 10.532801628112793, "kl": 0.03603391721844673, "learning_rate": 1.0516934046345812e-06, "loss": 0.0014, "reward": 0.19228124618530273, "reward_std": 0.327519029378891, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09853124618530273, "step": 118 }, { "completion_length": 193.375, "epoch": 0.06367041198501873, "grad_norm": 1.5193909406661987, "kl": 0.0027132052928209305, "learning_rate": 1.0606060606060608e-06, "loss": 0.0001, "reward": 0.1250937581062317, "reward_std": 0.09520626068115234, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09384375810623169, "step": 119 }, { "completion_length": 174.0625, "epoch": 0.06420545746388442, "grad_norm": 1.1305453777313232, "kl": 0.003308475250378251, "learning_rate": 1.0695187165775401e-06, "loss": 0.0001, "reward": 0.1172500029206276, "reward_std": 0.15681493282318115, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.101624995470047, "step": 120 }, { "completion_length": 182.75, "epoch": 0.06474050294275013, "grad_norm": 2.019829273223877, "kl": 0.02029380388557911, "learning_rate": 1.0784313725490197e-06, "loss": 0.0008, "reward": 0.052531249821186066, "reward_std": 0.1248614713549614, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05253124609589577, "step": 121 }, { "completion_length": 190.625, "epoch": 0.06527554842161584, "grad_norm": 1.410351037979126, "kl": 0.014345349743962288, "learning_rate": 1.0873440285204992e-06, "loss": 0.0006, "reward": 0.11056250333786011, "reward_std": 0.10942798852920532, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09493750333786011, "step": 122 }, { "completion_length": 180.28125, "epoch": 0.06581059390048154, "grad_norm": 1.9351345300674438, "kl": 0.0044614216312766075, "learning_rate": 1.0962566844919787e-06, "loss": 0.0002, "reward": 0.20415624976158142, "reward_std": 0.29337427020072937, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11040625721216202, "step": 123 }, { "completion_length": 180.0625, "epoch": 0.06634563937934725, "grad_norm": 0.5759527683258057, "kl": 0.004589931108057499, "learning_rate": 1.1051693404634583e-06, "loss": 0.0002, "reward": 0.23362500965595245, "reward_std": 0.08360370993614197, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10862500220537186, "step": 124 }, { "completion_length": 184.875, "epoch": 0.06688068485821295, "grad_norm": 1.3053683042526245, "kl": 0.005052396561950445, "learning_rate": 1.1140819964349376e-06, "loss": 0.0002, "reward": 0.10534375160932541, "reward_std": 0.05500267818570137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10534374415874481, "step": 125 }, { "completion_length": 171.78125, "epoch": 0.06741573033707865, "grad_norm": 15.257534980773926, "kl": 0.047724273055791855, "learning_rate": 1.1229946524064172e-06, "loss": 0.0019, "reward": 0.39140623807907104, "reward_std": 0.4307621121406555, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11015624552965164, "step": 126 }, { "completion_length": 178.625, "epoch": 0.06795077581594436, "grad_norm": 2.0709662437438965, "kl": 0.005100180860608816, "learning_rate": 1.1319073083778967e-06, "loss": 0.0002, "reward": 0.0715312510728836, "reward_std": 0.12375873327255249, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0715312510728836, "step": 127 }, { "completion_length": 177.90625, "epoch": 0.06848582129481005, "grad_norm": 3.757835865020752, "kl": 0.014601380564272404, "learning_rate": 1.140819964349376e-06, "loss": 0.0006, "reward": 0.13378125429153442, "reward_std": 0.1182972639799118, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11815625429153442, "step": 128 }, { "completion_length": 190.3125, "epoch": 0.06902086677367576, "grad_norm": 1.2251341342926025, "kl": 0.0027984660118818283, "learning_rate": 1.1497326203208558e-06, "loss": 0.0001, "reward": 0.047343749552965164, "reward_std": 0.1099495142698288, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.047343749552965164, "step": 129 }, { "completion_length": 188.03125, "epoch": 0.06955591225254147, "grad_norm": 1.5232963562011719, "kl": 0.01044206228107214, "learning_rate": 1.1586452762923351e-06, "loss": 0.0004, "reward": 0.12474999576807022, "reward_std": 0.09424008429050446, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10912500321865082, "step": 130 }, { "completion_length": 182.0625, "epoch": 0.07009095773140717, "grad_norm": 1.7230414152145386, "kl": 0.005438798572868109, "learning_rate": 1.1675579322638147e-06, "loss": 0.0002, "reward": 0.15987500548362732, "reward_std": 0.295952707529068, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.06612499803304672, "step": 131 }, { "completion_length": 188.53125, "epoch": 0.07062600321027288, "grad_norm": 1.253174066543579, "kl": 0.004047672264277935, "learning_rate": 1.1764705882352942e-06, "loss": 0.0002, "reward": 0.08762499690055847, "reward_std": 0.08589018881320953, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08762500435113907, "step": 132 }, { "completion_length": 174.1875, "epoch": 0.07116104868913857, "grad_norm": 2.8104865550994873, "kl": 0.006958743091672659, "learning_rate": 1.1853832442067738e-06, "loss": 0.0003, "reward": 0.3022187650203705, "reward_std": 0.3700447976589203, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1303437501192093, "step": 133 }, { "completion_length": 183.625, "epoch": 0.07169609416800428, "grad_norm": 0.8546741008758545, "kl": 0.005257762037217617, "learning_rate": 1.1942959001782533e-06, "loss": 0.0002, "reward": 0.0949687510728836, "reward_std": 0.16865192353725433, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0637187510728836, "step": 134 }, { "completion_length": 172.3125, "epoch": 0.07223113964686999, "grad_norm": 47.627166748046875, "kl": 0.32647228240966797, "learning_rate": 1.2032085561497326e-06, "loss": 0.0131, "reward": 0.043781254440546036, "reward_std": 0.13645333051681519, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04378125071525574, "step": 135 }, { "completion_length": 183.40625, "epoch": 0.07276618512573568, "grad_norm": 0.7891777157783508, "kl": 0.003703095717355609, "learning_rate": 1.2121212121212122e-06, "loss": 0.0001, "reward": 0.24281249940395355, "reward_std": 0.36673298478126526, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07093749940395355, "step": 136 }, { "completion_length": 180.125, "epoch": 0.07330123060460139, "grad_norm": 1.5897918939590454, "kl": 0.005954062566161156, "learning_rate": 1.2210338680926917e-06, "loss": 0.0002, "reward": 0.3826874792575836, "reward_std": 0.4391295909881592, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.11706249415874481, "step": 137 }, { "completion_length": 175.15625, "epoch": 0.0738362760834671, "grad_norm": 1.0346181392669678, "kl": 0.003735364880412817, "learning_rate": 1.2299465240641713e-06, "loss": 0.0001, "reward": 0.11434374749660492, "reward_std": 0.06996370851993561, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11434375494718552, "step": 138 }, { "completion_length": 176.40625, "epoch": 0.0743713215623328, "grad_norm": 3.242959499359131, "kl": 0.09526674449443817, "learning_rate": 1.2388591800356508e-06, "loss": 0.0038, "reward": 0.24943749606609344, "reward_std": 0.33800214529037476, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09318750351667404, "step": 139 }, { "completion_length": 180.09375, "epoch": 0.0749063670411985, "grad_norm": 2.883775234222412, "kl": 0.007307551335543394, "learning_rate": 1.2477718360071302e-06, "loss": 0.0003, "reward": 0.3437812328338623, "reward_std": 0.636216402053833, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0312812514603138, "step": 140 }, { "completion_length": 189.65625, "epoch": 0.0754414125200642, "grad_norm": 2.0935957431793213, "kl": 0.004123710561543703, "learning_rate": 1.2566844919786097e-06, "loss": 0.0002, "reward": 0.1289999932050705, "reward_std": 0.12939050793647766, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0977499932050705, "step": 141 }, { "completion_length": 174.46875, "epoch": 0.07597645799892991, "grad_norm": 519761.40625, "kl": 5838.798828125, "learning_rate": 1.2655971479500893e-06, "loss": 233.552, "reward": 0.218812495470047, "reward_std": 0.3966822624206543, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.062562495470047, "step": 142 }, { "completion_length": 179.25, "epoch": 0.07651150347779562, "grad_norm": 1.7091304063796997, "kl": 0.0035021663643419743, "learning_rate": 1.2745098039215686e-06, "loss": 0.0001, "reward": 0.17290624976158142, "reward_std": 0.28861358761787415, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07915625721216202, "step": 143 }, { "completion_length": 169.46875, "epoch": 0.07704654895666131, "grad_norm": 1.3560471534729004, "kl": 0.009054142981767654, "learning_rate": 1.2834224598930483e-06, "loss": 0.0004, "reward": 0.16428124904632568, "reward_std": 0.1421152651309967, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11740624904632568, "step": 144 }, { "completion_length": 183.4375, "epoch": 0.07758159443552702, "grad_norm": 1.1129016876220703, "kl": 0.00518418662250042, "learning_rate": 1.2923351158645279e-06, "loss": 0.0002, "reward": 0.1525000035762787, "reward_std": 0.11942566186189651, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12125000357627869, "step": 145 }, { "completion_length": 156.1875, "epoch": 0.07811663991439273, "grad_norm": 2.249830484390259, "kl": 0.024319469928741455, "learning_rate": 1.3012477718360072e-06, "loss": 0.001, "reward": 0.2553125023841858, "reward_std": 0.2699525058269501, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11468750238418579, "step": 146 }, { "completion_length": 161.15625, "epoch": 0.07865168539325842, "grad_norm": 7172183.5, "kl": 39672.09765625, "learning_rate": 1.3101604278074868e-06, "loss": 1586.8839, "reward": 0.1264062523841858, "reward_std": 0.07585255801677704, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1264062523841858, "step": 147 }, { "completion_length": 168.84375, "epoch": 0.07918673087212413, "grad_norm": 29.476999282836914, "kl": 1.264482021331787, "learning_rate": 1.319073083778966e-06, "loss": 0.0506, "reward": 0.21556249260902405, "reward_std": 0.28633421659469604, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12181250751018524, "step": 148 }, { "completion_length": 182.5, "epoch": 0.07972177635098983, "grad_norm": 1.5422297716140747, "kl": 0.011974995955824852, "learning_rate": 1.3279857397504459e-06, "loss": 0.0005, "reward": 0.07828125357627869, "reward_std": 0.12109462171792984, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07828125357627869, "step": 149 }, { "completion_length": 183.125, "epoch": 0.08025682182985554, "grad_norm": 1.9570101499557495, "kl": 0.004141484387218952, "learning_rate": 1.3368983957219254e-06, "loss": 0.0002, "reward": 0.08571875095367432, "reward_std": 0.11709937453269958, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08571875095367432, "step": 150 }, { "completion_length": 186.65625, "epoch": 0.08079186730872125, "grad_norm": 1.9544771909713745, "kl": 0.07949696481227875, "learning_rate": 1.3458110516934047e-06, "loss": 0.0032, "reward": 0.10506249964237213, "reward_std": 0.11634407192468643, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08943749964237213, "step": 151 }, { "completion_length": 180.71875, "epoch": 0.08132691278758694, "grad_norm": 3.476135492324829, "kl": 0.13472065329551697, "learning_rate": 1.3547237076648843e-06, "loss": 0.0054, "reward": 0.15312498807907104, "reward_std": 0.16957074403762817, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09062500298023224, "step": 152 }, { "completion_length": 189.40625, "epoch": 0.08186195826645265, "grad_norm": 0.7911190390586853, "kl": 0.004223458934575319, "learning_rate": 1.3636363636363636e-06, "loss": 0.0002, "reward": 0.19828125834465027, "reward_std": 0.3495638072490692, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07328125089406967, "step": 153 }, { "completion_length": 178.1875, "epoch": 0.08239700374531835, "grad_norm": 1.3410987854003906, "kl": 0.0026781996712088585, "learning_rate": 1.3725490196078434e-06, "loss": 0.0001, "reward": 0.2720625102519989, "reward_std": 0.3294582962989807, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10018749535083771, "step": 154 }, { "completion_length": 178.6875, "epoch": 0.08293204922418405, "grad_norm": 2.757493495941162, "kl": 0.008807975798845291, "learning_rate": 1.381461675579323e-06, "loss": 0.0004, "reward": 0.12700000405311584, "reward_std": 0.09957610070705414, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12700000405311584, "step": 155 }, { "completion_length": 184.28125, "epoch": 0.08346709470304976, "grad_norm": 8.4693021774292, "kl": 0.051532525569200516, "learning_rate": 1.3903743315508022e-06, "loss": 0.0021, "reward": 0.12174999713897705, "reward_std": 0.16196087002754211, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09049999713897705, "step": 156 }, { "completion_length": 166.78125, "epoch": 0.08400214018191546, "grad_norm": 0.9092446565628052, "kl": 0.0075800782069563866, "learning_rate": 1.3992869875222818e-06, "loss": 0.0003, "reward": 0.20137499272823334, "reward_std": 0.35429301857948303, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09200000017881393, "step": 157 }, { "completion_length": 170.78125, "epoch": 0.08453718566078117, "grad_norm": 1.4076294898986816, "kl": 0.004662873223423958, "learning_rate": 1.4081996434937611e-06, "loss": 0.0002, "reward": 0.7049687504768372, "reward_std": 0.8995665907859802, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12684375047683716, "step": 158 }, { "completion_length": 172.34375, "epoch": 0.08507223113964688, "grad_norm": 1.434618353843689, "kl": 0.0056721121072769165, "learning_rate": 1.4171122994652409e-06, "loss": 0.0002, "reward": 0.08809375017881393, "reward_std": 0.09439639002084732, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08809375017881393, "step": 159 }, { "completion_length": 175.25, "epoch": 0.08560727661851257, "grad_norm": 1.5290415287017822, "kl": 0.00822750385850668, "learning_rate": 1.4260249554367202e-06, "loss": 0.0003, "reward": 0.11740624904632568, "reward_std": 0.10980755090713501, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10178125649690628, "step": 160 }, { "completion_length": 173.34375, "epoch": 0.08614232209737828, "grad_norm": 150747693056.0, "kl": 407420960.0, "learning_rate": 1.4349376114081998e-06, "loss": 16296838.0, "reward": 0.11731249839067459, "reward_std": 0.09265105426311493, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.117312490940094, "step": 161 }, { "completion_length": 172.65625, "epoch": 0.08667736757624397, "grad_norm": 3.529024124145508, "kl": 0.03340981528162956, "learning_rate": 1.4438502673796793e-06, "loss": 0.0013, "reward": 0.1341562420129776, "reward_std": 0.15739436447620392, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1029062569141388, "step": 162 }, { "completion_length": 176.09375, "epoch": 0.08721241305510968, "grad_norm": 3.119518756866455, "kl": 0.004711393732577562, "learning_rate": 1.4527629233511586e-06, "loss": 0.0002, "reward": 0.0976874977350235, "reward_std": 0.14852885901927948, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0976874977350235, "step": 163 }, { "completion_length": 190.3125, "epoch": 0.08774745853397539, "grad_norm": 0.6541728973388672, "kl": 0.008747007697820663, "learning_rate": 1.4616755793226384e-06, "loss": 0.0003, "reward": 0.11765624582767487, "reward_std": 0.11284056305885315, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08640625327825546, "step": 164 }, { "completion_length": 183.5625, "epoch": 0.08828250401284109, "grad_norm": 1.8309099674224854, "kl": 0.02304363250732422, "learning_rate": 1.4705882352941177e-06, "loss": 0.0009, "reward": 0.33784377574920654, "reward_std": 0.6037758588790894, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08784374594688416, "step": 165 }, { "completion_length": 180.75, "epoch": 0.0888175494917068, "grad_norm": 0.6208946704864502, "kl": 0.005812976974993944, "learning_rate": 1.4795008912655973e-06, "loss": 0.0002, "reward": 0.15278124809265137, "reward_std": 0.37605053186416626, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04340624809265137, "step": 166 }, { "completion_length": 186.625, "epoch": 0.0893525949705725, "grad_norm": 1.1830217838287354, "kl": 0.004520305432379246, "learning_rate": 1.4884135472370766e-06, "loss": 0.0002, "reward": 0.1615000069141388, "reward_std": 0.284692645072937, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833749994635582, "step": 167 }, { "completion_length": 178.8125, "epoch": 0.0898876404494382, "grad_norm": 1.5777482986450195, "kl": 0.007111437618732452, "learning_rate": 1.4973262032085562e-06, "loss": 0.0003, "reward": 0.11349999904632568, "reward_std": 0.08288140594959259, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09787499904632568, "step": 168 }, { "completion_length": 173.15625, "epoch": 0.09042268592830391, "grad_norm": 1.849491000175476, "kl": 0.006594131700694561, "learning_rate": 1.506238859180036e-06, "loss": 0.0003, "reward": 0.06428125500679016, "reward_std": 0.13083116710186005, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06428125500679016, "step": 169 }, { "completion_length": 169.9375, "epoch": 0.0909577314071696, "grad_norm": 0.6856806874275208, "kl": 0.005578930489718914, "learning_rate": 1.5151515151515152e-06, "loss": 0.0002, "reward": 0.39381250739097595, "reward_std": 0.5812713503837585, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11256250739097595, "step": 170 }, { "completion_length": 180.3125, "epoch": 0.09149277688603531, "grad_norm": 1.1829952001571655, "kl": 0.00846106093376875, "learning_rate": 1.5240641711229948e-06, "loss": 0.0003, "reward": 0.36390626430511475, "reward_std": 0.3865428864955902, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11390624940395355, "step": 171 }, { "completion_length": 175.0625, "epoch": 0.09202782236490102, "grad_norm": 1.157149314880371, "kl": 0.008840767666697502, "learning_rate": 1.5329768270944741e-06, "loss": 0.0004, "reward": 0.1588124930858612, "reward_std": 0.1641991138458252, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0963125079870224, "step": 172 }, { "completion_length": 181.03125, "epoch": 0.09256286784376672, "grad_norm": 2.6525609493255615, "kl": 0.00810306053608656, "learning_rate": 1.5418894830659537e-06, "loss": 0.0003, "reward": 0.15256249904632568, "reward_std": 0.15415164828300476, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09006249904632568, "step": 173 }, { "completion_length": 170.65625, "epoch": 0.09309791332263243, "grad_norm": 0.7046040892601013, "kl": 0.008506656624376774, "learning_rate": 1.5508021390374334e-06, "loss": 0.0003, "reward": 0.3333125114440918, "reward_std": 0.4572436809539795, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0989374965429306, "step": 174 }, { "completion_length": 170.53125, "epoch": 0.09363295880149813, "grad_norm": 2.3294055461883545, "kl": 0.009941745549440384, "learning_rate": 1.5597147950089128e-06, "loss": 0.0004, "reward": 0.48875001072883606, "reward_std": 0.6254594326019287, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09812499582767487, "step": 175 }, { "completion_length": 181.4375, "epoch": 0.09416800428036383, "grad_norm": 2.153499126434326, "kl": 0.005750279873609543, "learning_rate": 1.5686274509803923e-06, "loss": 0.0002, "reward": 0.06996875256299973, "reward_std": 0.0903455913066864, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06996874511241913, "step": 176 }, { "completion_length": 183.5625, "epoch": 0.09470304975922954, "grad_norm": 1.9827061891555786, "kl": 0.018909700214862823, "learning_rate": 1.5775401069518716e-06, "loss": 0.0008, "reward": 0.06709375232458115, "reward_std": 0.12330427765846252, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06709374487400055, "step": 177 }, { "completion_length": 172.65625, "epoch": 0.09523809523809523, "grad_norm": 4475709.5, "kl": 6535.67431640625, "learning_rate": 1.5864527629233512e-06, "loss": 261.427, "reward": 0.3024374842643738, "reward_std": 0.4821438193321228, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13056249916553497, "step": 178 }, { "completion_length": 168.78125, "epoch": 0.09577314071696094, "grad_norm": 1.6190558671951294, "kl": 0.012868589721620083, "learning_rate": 1.595365418894831e-06, "loss": 0.0005, "reward": 0.21703124046325684, "reward_std": 0.5079001188278198, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06078124791383743, "step": 179 }, { "completion_length": 179.125, "epoch": 0.09630818619582665, "grad_norm": 1.0022549629211426, "kl": 0.01392733957618475, "learning_rate": 1.6042780748663103e-06, "loss": 0.0006, "reward": 0.24937500059604645, "reward_std": 0.39151573181152344, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07750000059604645, "step": 180 }, { "completion_length": 155.125, "epoch": 0.09684323167469235, "grad_norm": 26574.27734375, "kl": 7.674831867218018, "learning_rate": 1.6131907308377898e-06, "loss": 0.307, "reward": 0.33559373021125793, "reward_std": 0.5189287066459656, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14809376001358032, "step": 181 }, { "completion_length": 178.34375, "epoch": 0.09737827715355805, "grad_norm": 3.349975824356079, "kl": 0.012146905064582825, "learning_rate": 1.6221033868092691e-06, "loss": 0.0005, "reward": 0.31806251406669617, "reward_std": 0.5479229092597961, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13056249916553497, "step": 182 }, { "completion_length": 184.0625, "epoch": 0.09791332263242375, "grad_norm": 1.8612773418426514, "kl": 0.013270605355501175, "learning_rate": 1.631016042780749e-06, "loss": 0.0005, "reward": 0.15571875870227814, "reward_std": 0.32077527046203613, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07759374380111694, "step": 183 }, { "completion_length": 182.0, "epoch": 0.09844836811128946, "grad_norm": 6.81226921081543, "kl": 0.04789917916059494, "learning_rate": 1.6399286987522285e-06, "loss": 0.0019, "reward": 0.12384374439716339, "reward_std": 0.05214359238743782, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12384375929832458, "step": 184 }, { "completion_length": 180.375, "epoch": 0.09898341359015517, "grad_norm": 1.1535921096801758, "kl": 0.01392707321792841, "learning_rate": 1.6488413547237078e-06, "loss": 0.0006, "reward": 0.19346876442432404, "reward_std": 0.33403149247169495, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09971874952316284, "step": 185 }, { "completion_length": 187.9375, "epoch": 0.09951845906902086, "grad_norm": 0.7143915295600891, "kl": 0.007148392964154482, "learning_rate": 1.6577540106951873e-06, "loss": 0.0003, "reward": 0.1499062478542328, "reward_std": 0.15214651823043823, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10303124785423279, "step": 186 }, { "completion_length": 162.15625, "epoch": 0.10005350454788657, "grad_norm": 1.8157223463058472, "kl": 0.021965306252241135, "learning_rate": 1.6666666666666667e-06, "loss": 0.0009, "reward": 0.45478126406669617, "reward_std": 0.3828459084033966, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14228124916553497, "step": 187 }, { "completion_length": 184.09375, "epoch": 0.10058855002675228, "grad_norm": 3.2038891315460205, "kl": 0.04941853880882263, "learning_rate": 1.6755793226381464e-06, "loss": 0.002, "reward": 0.19574999809265137, "reward_std": 0.34370940923690796, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10199999809265137, "step": 188 }, { "completion_length": 179.5, "epoch": 0.10112359550561797, "grad_norm": 1.1850038766860962, "kl": 0.0051124971359968185, "learning_rate": 1.684491978609626e-06, "loss": 0.0002, "reward": 0.23237501084804535, "reward_std": 0.2627890706062317, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10737499594688416, "step": 189 }, { "completion_length": 191.0, "epoch": 0.10165864098448368, "grad_norm": 0.8362215757369995, "kl": 0.008282121270895004, "learning_rate": 1.6934046345811053e-06, "loss": 0.0003, "reward": 0.09537500143051147, "reward_std": 0.13657163083553314, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07975000143051147, "step": 190 }, { "completion_length": 189.0, "epoch": 0.10219368646334938, "grad_norm": 0.9496135711669922, "kl": 0.01495653111487627, "learning_rate": 1.7023172905525848e-06, "loss": 0.0006, "reward": 0.1262499988079071, "reward_std": 0.07792506366968155, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1106249988079071, "step": 191 }, { "completion_length": 192.25, "epoch": 0.10272873194221509, "grad_norm": 0.5476675629615784, "kl": 0.009178446605801582, "learning_rate": 1.7112299465240642e-06, "loss": 0.0004, "reward": 0.07559375464916229, "reward_std": 0.07571912556886673, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0755937471985817, "step": 192 }, { "completion_length": 183.03125, "epoch": 0.1032637774210808, "grad_norm": 1.3964464664459229, "kl": 0.02117517590522766, "learning_rate": 1.720142602495544e-06, "loss": 0.0008, "reward": 0.3191874921321869, "reward_std": 0.5526077747344971, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08481249958276749, "step": 193 }, { "completion_length": 191.625, "epoch": 0.10379882289994649, "grad_norm": 1.2217278480529785, "kl": 0.00884051714092493, "learning_rate": 1.7290552584670235e-06, "loss": 0.0004, "reward": 0.09649999439716339, "reward_std": 0.11696694791316986, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08087500184774399, "step": 194 }, { "completion_length": 177.9375, "epoch": 0.1043338683788122, "grad_norm": 0.7894971966743469, "kl": 0.02088468335568905, "learning_rate": 1.7379679144385028e-06, "loss": 0.0008, "reward": 0.18056251108646393, "reward_std": 0.1515151560306549, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14931249618530273, "step": 195 }, { "completion_length": 183.875, "epoch": 0.10486891385767791, "grad_norm": 0.9627255797386169, "kl": 0.07147856056690216, "learning_rate": 1.7468805704099824e-06, "loss": 0.0029, "reward": 0.13215625286102295, "reward_std": 0.3324153423309326, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05403124541044235, "step": 196 }, { "completion_length": 185.46875, "epoch": 0.1054039593365436, "grad_norm": 1.3429826498031616, "kl": 0.02192489430308342, "learning_rate": 1.7557932263814617e-06, "loss": 0.0009, "reward": 0.07784374803304672, "reward_std": 0.09708334505558014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07784374803304672, "step": 197 }, { "completion_length": 169.84375, "epoch": 0.10593900481540931, "grad_norm": 1.5809382200241089, "kl": 0.0105059165507555, "learning_rate": 1.7647058823529414e-06, "loss": 0.0004, "reward": 0.29225000739097595, "reward_std": 0.39619874954223633, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13599999248981476, "step": 198 }, { "completion_length": 175.40625, "epoch": 0.10647405029427501, "grad_norm": 1188.3819580078125, "kl": 1.1922004222869873, "learning_rate": 1.7736185383244208e-06, "loss": 0.0477, "reward": 0.12003123760223389, "reward_std": 0.11514033377170563, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10440624505281448, "step": 199 }, { "completion_length": 169.625, "epoch": 0.10700909577314072, "grad_norm": 1.5337498188018799, "kl": 0.03621268272399902, "learning_rate": 1.7825311942959003e-06, "loss": 0.0014, "reward": 0.12653125822544098, "reward_std": 0.11770245432853699, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11090624332427979, "step": 200 }, { "completion_length": 173.0625, "epoch": 0.10754414125200643, "grad_norm": 2.5692551136016846, "kl": 0.06743980944156647, "learning_rate": 1.7914438502673799e-06, "loss": 0.0027, "reward": 0.4893125295639038, "reward_std": 0.6004013419151306, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11431249976158142, "step": 201 }, { "completion_length": 188.09375, "epoch": 0.10807918673087212, "grad_norm": 0.7396433353424072, "kl": 0.019359346479177475, "learning_rate": 1.8003565062388592e-06, "loss": 0.0008, "reward": 0.13231250643730164, "reward_std": 0.09656383097171783, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11668749153614044, "step": 202 }, { "completion_length": 172.25, "epoch": 0.10861423220973783, "grad_norm": 1.1002416610717773, "kl": 0.041391074657440186, "learning_rate": 1.809269162210339e-06, "loss": 0.0017, "reward": 0.1586875021457672, "reward_std": 0.3296080231666565, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08056250214576721, "step": 203 }, { "completion_length": 185.03125, "epoch": 0.10914927768860354, "grad_norm": 2.2023303508758545, "kl": 0.01351974904537201, "learning_rate": 1.8181818181818183e-06, "loss": 0.0005, "reward": 0.16215625405311584, "reward_std": 0.1295420080423355, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11528125405311584, "step": 204 }, { "completion_length": 174.25, "epoch": 0.10968432316746923, "grad_norm": 1.686858892440796, "kl": 0.013111666776239872, "learning_rate": 1.8270944741532978e-06, "loss": 0.0005, "reward": 0.20906250178813934, "reward_std": 0.33146917819976807, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09968749433755875, "step": 205 }, { "completion_length": 169.25, "epoch": 0.11021936864633494, "grad_norm": 0.867652416229248, "kl": 0.015391803346574306, "learning_rate": 1.8360071301247772e-06, "loss": 0.0006, "reward": 0.23771874606609344, "reward_std": 0.3231508433818817, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12834374606609344, "step": 206 }, { "completion_length": 150.96875, "epoch": 0.11075441412520064, "grad_norm": 2.8354673385620117, "kl": 0.037763502448797226, "learning_rate": 1.8449197860962567e-06, "loss": 0.0015, "reward": 0.6856250166893005, "reward_std": 0.6691159009933472, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16999998688697815, "step": 207 }, { "completion_length": 183.34375, "epoch": 0.11128945960406635, "grad_norm": 1.2572733163833618, "kl": 0.016670260578393936, "learning_rate": 1.8538324420677365e-06, "loss": 0.0007, "reward": 0.07450000196695328, "reward_std": 0.1045808494091034, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07449999451637268, "step": 208 }, { "completion_length": 168.90625, "epoch": 0.11182450508293205, "grad_norm": 36.298152923583984, "kl": 0.3890148103237152, "learning_rate": 1.8627450980392158e-06, "loss": 0.0156, "reward": 0.22475001215934753, "reward_std": 0.34893137216567993, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09974999725818634, "step": 209 }, { "completion_length": 160.28125, "epoch": 0.11235955056179775, "grad_norm": 0.9047214984893799, "kl": 0.02724892646074295, "learning_rate": 1.8716577540106954e-06, "loss": 0.0011, "reward": 0.14284375309944153, "reward_std": 0.08393636345863342, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14284375309944153, "step": 210 }, { "completion_length": 182.3125, "epoch": 0.11289459604066346, "grad_norm": 1.1401950120925903, "kl": 0.015950046479701996, "learning_rate": 1.8805704099821747e-06, "loss": 0.0006, "reward": 0.1964375078678131, "reward_std": 0.27515050768852234, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11831249296665192, "step": 211 }, { "completion_length": 176.28125, "epoch": 0.11342964151952915, "grad_norm": 30.44670295715332, "kl": 0.2601291537284851, "learning_rate": 1.8894830659536542e-06, "loss": 0.0104, "reward": 0.3164687752723694, "reward_std": 0.553767204284668, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.082093745470047, "step": 212 }, { "completion_length": 194.9375, "epoch": 0.11396468699839486, "grad_norm": 0.6609180569648743, "kl": 0.012533347122371197, "learning_rate": 1.898395721925134e-06, "loss": 0.0005, "reward": 0.1666562408208847, "reward_std": 0.29151028394699097, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0729062557220459, "step": 213 }, { "completion_length": 181.25, "epoch": 0.11449973247726057, "grad_norm": 3.7581875324249268, "kl": 0.013152295723557472, "learning_rate": 1.9073083778966133e-06, "loss": 0.0005, "reward": 0.41571876406669617, "reward_std": 0.4311841130256653, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08759374916553497, "step": 214 }, { "completion_length": 175.28125, "epoch": 0.11503477795612627, "grad_norm": 1.0736836194992065, "kl": 0.035737328231334686, "learning_rate": 1.9162210338680927e-06, "loss": 0.0014, "reward": 0.06309375166893005, "reward_std": 0.15159545838832855, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.047468751668930054, "step": 215 }, { "completion_length": 166.6875, "epoch": 0.11556982343499198, "grad_norm": 37541.078125, "kl": 73.6550064086914, "learning_rate": 1.9251336898395724e-06, "loss": 2.9462, "reward": 0.304437518119812, "reward_std": 0.4352279007434845, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11693750321865082, "step": 216 }, { "completion_length": 174.0, "epoch": 0.11610486891385768, "grad_norm": 12.090673446655273, "kl": 0.37115001678466797, "learning_rate": 1.9340463458110517e-06, "loss": 0.0148, "reward": 0.29356250166893005, "reward_std": 0.41979509592056274, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09043750166893005, "step": 217 }, { "completion_length": 192.53125, "epoch": 0.11663991439272338, "grad_norm": 0.8689852356910706, "kl": 0.014298921450972557, "learning_rate": 1.9429590017825315e-06, "loss": 0.0006, "reward": 0.08756250143051147, "reward_std": 0.06665623933076859, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08756250143051147, "step": 218 }, { "completion_length": 189.96875, "epoch": 0.11717495987158909, "grad_norm": 5.99603796005249, "kl": 0.019608888775110245, "learning_rate": 1.951871657754011e-06, "loss": 0.0008, "reward": 0.11353125423192978, "reward_std": 0.1620996594429016, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05103125050663948, "step": 219 }, { "completion_length": 168.71875, "epoch": 0.11771000535045478, "grad_norm": 0.7680837512016296, "kl": 0.02420741878449917, "learning_rate": 1.96078431372549e-06, "loss": 0.001, "reward": 0.5180624723434448, "reward_std": 0.6956161260604858, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11181250214576721, "step": 220 }, { "completion_length": 172.40625, "epoch": 0.11824505082932049, "grad_norm": 2.592491388320923, "kl": 0.020366720855236053, "learning_rate": 1.96969696969697e-06, "loss": 0.0008, "reward": 0.46518751978874207, "reward_std": 0.5836637020111084, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13706250488758087, "step": 221 }, { "completion_length": 170.59375, "epoch": 0.1187800963081862, "grad_norm": 2.2480854988098145, "kl": 0.018065165728330612, "learning_rate": 1.9786096256684497e-06, "loss": 0.0007, "reward": 0.25043749809265137, "reward_std": 0.31738242506980896, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.14106249809265137, "step": 222 }, { "completion_length": 176.125, "epoch": 0.1193151417870519, "grad_norm": 4.050024509429932, "kl": 0.07714779675006866, "learning_rate": 1.987522281639929e-06, "loss": 0.0031, "reward": 0.18471874296665192, "reward_std": 0.1410328894853592, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10659374296665192, "step": 223 }, { "completion_length": 172.0625, "epoch": 0.1198501872659176, "grad_norm": 0.6202077269554138, "kl": 0.011261794716119766, "learning_rate": 1.9964349376114083e-06, "loss": 0.0005, "reward": 0.2628437578678131, "reward_std": 0.5001048445701599, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07534375041723251, "step": 224 }, { "completion_length": 186.375, "epoch": 0.12038523274478331, "grad_norm": 0.7025286555290222, "kl": 0.01153456512838602, "learning_rate": 2.0053475935828877e-06, "loss": 0.0005, "reward": 0.08271874487400055, "reward_std": 0.12659746408462524, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06709375232458115, "step": 225 }, { "completion_length": 179.125, "epoch": 0.12092027822364901, "grad_norm": 1.3802123069763184, "kl": 0.010395502671599388, "learning_rate": 2.0142602495543674e-06, "loss": 0.0004, "reward": 0.23103123903274536, "reward_std": 0.31301945447921753, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10603125393390656, "step": 226 }, { "completion_length": 189.8125, "epoch": 0.12145532370251472, "grad_norm": 1.8801831007003784, "kl": 0.011835373938083649, "learning_rate": 2.023172905525847e-06, "loss": 0.0005, "reward": 0.3109062612056732, "reward_std": 0.3714413344860077, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12340625375509262, "step": 227 }, { "completion_length": 182.5625, "epoch": 0.12199036918138041, "grad_norm": 24.300010681152344, "kl": 0.0541793629527092, "learning_rate": 2.0320855614973265e-06, "loss": 0.0022, "reward": 0.14249999821186066, "reward_std": 0.10343141853809357, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11124999076128006, "step": 228 }, { "completion_length": 192.53125, "epoch": 0.12252541466024612, "grad_norm": 0.7531638741493225, "kl": 0.015188846737146378, "learning_rate": 2.040998217468806e-06, "loss": 0.0006, "reward": 0.09800000488758087, "reward_std": 0.10297395288944244, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08237499743700027, "step": 229 }, { "completion_length": 172.90625, "epoch": 0.12306046013911183, "grad_norm": 17818974.0, "kl": 11401.6591796875, "learning_rate": 2.049910873440285e-06, "loss": 456.0663, "reward": 0.2016250044107437, "reward_std": 0.3747360110282898, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09225000441074371, "step": 230 }, { "completion_length": 167.09375, "epoch": 0.12359550561797752, "grad_norm": 3.256290912628174, "kl": 0.042127981781959534, "learning_rate": 2.058823529411765e-06, "loss": 0.0017, "reward": 0.2524375021457672, "reward_std": 0.18902841210365295, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1430625021457672, "step": 231 }, { "completion_length": 166.15625, "epoch": 0.12413055109684323, "grad_norm": 0.6679757833480835, "kl": 0.026257432997226715, "learning_rate": 2.0677361853832447e-06, "loss": 0.0011, "reward": 0.19090625643730164, "reward_std": 0.18994548916816711, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14403125643730164, "step": 232 }, { "completion_length": 171.53125, "epoch": 0.12466559657570893, "grad_norm": 236357312.0, "kl": 486972.46875, "learning_rate": 2.076648841354724e-06, "loss": 19478.8984, "reward": 0.308843731880188, "reward_std": 0.49600422382354736, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.136968731880188, "step": 233 }, { "completion_length": 165.8125, "epoch": 0.12520064205457465, "grad_norm": 0.5764390826225281, "kl": 0.01799147017300129, "learning_rate": 2.0855614973262034e-06, "loss": 0.0007, "reward": 0.1498437523841858, "reward_std": 0.12119601666927338, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11859375238418579, "step": 234 }, { "completion_length": 160.875, "epoch": 0.12573568753344033, "grad_norm": 1.6355236768722534, "kl": 0.06465653330087662, "learning_rate": 2.0944741532976827e-06, "loss": 0.0026, "reward": 0.5626875162124634, "reward_std": 0.5894583463668823, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15643750131130219, "step": 235 }, { "completion_length": 173.5, "epoch": 0.12627073301230604, "grad_norm": 1.425333857536316, "kl": 0.012852119281888008, "learning_rate": 2.1033868092691625e-06, "loss": 0.0005, "reward": 0.2241249978542328, "reward_std": 0.28956013917922974, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11475000530481339, "step": 236 }, { "completion_length": 164.5, "epoch": 0.12680577849117175, "grad_norm": 5.129726409912109, "kl": 0.06404737383127213, "learning_rate": 2.112299465240642e-06, "loss": 0.0026, "reward": 0.3070937395095825, "reward_std": 0.5573974251747131, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1352187544107437, "step": 237 }, { "completion_length": 180.125, "epoch": 0.12734082397003746, "grad_norm": 2.2154388427734375, "kl": 0.03265200927853584, "learning_rate": 2.1212121212121216e-06, "loss": 0.0013, "reward": 0.20643749833106995, "reward_std": 0.3579389750957489, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08143749833106995, "step": 238 }, { "completion_length": 183.5, "epoch": 0.12787586944890317, "grad_norm": 3.3060500621795654, "kl": 0.04142360761761665, "learning_rate": 2.130124777183601e-06, "loss": 0.0017, "reward": 0.3240937292575836, "reward_std": 0.41078805923461914, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08971875160932541, "step": 239 }, { "completion_length": 165.1875, "epoch": 0.12841091492776885, "grad_norm": 5.585768222808838, "kl": 0.013097296468913555, "learning_rate": 2.1390374331550802e-06, "loss": 0.0005, "reward": 0.19884376227855682, "reward_std": 0.18286773562431335, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12071874737739563, "step": 240 }, { "completion_length": 180.375, "epoch": 0.12894596040663456, "grad_norm": 47.46510696411133, "kl": 0.17976374924182892, "learning_rate": 2.14795008912656e-06, "loss": 0.0072, "reward": 0.1041875034570694, "reward_std": 0.09747858345508575, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041875034570694, "step": 241 }, { "completion_length": 177.40625, "epoch": 0.12948100588550027, "grad_norm": 1.6572966575622559, "kl": 0.016523832455277443, "learning_rate": 2.1568627450980393e-06, "loss": 0.0007, "reward": 0.5615625381469727, "reward_std": 0.3327963948249817, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15531250834465027, "step": 242 }, { "completion_length": 168.40625, "epoch": 0.13001605136436598, "grad_norm": 1.755509376525879, "kl": 0.021043308079242706, "learning_rate": 2.165775401069519e-06, "loss": 0.0008, "reward": 0.2681249976158142, "reward_std": 0.3940923511981964, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1274999976158142, "step": 243 }, { "completion_length": 157.96875, "epoch": 0.13055109684323168, "grad_norm": 4.561159610748291, "kl": 0.03638424724340439, "learning_rate": 2.1746880570409984e-06, "loss": 0.0015, "reward": 0.3069687485694885, "reward_std": 0.35569483041763306, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13509374856948853, "step": 244 }, { "completion_length": 188.84375, "epoch": 0.13108614232209737, "grad_norm": 2.6796376705169678, "kl": 0.007816595025360584, "learning_rate": 2.1836007130124777e-06, "loss": 0.0003, "reward": 0.18199999630451202, "reward_std": 0.16452434659004211, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11949999630451202, "step": 245 }, { "completion_length": 177.71875, "epoch": 0.13162118780096307, "grad_norm": 0.8215219378471375, "kl": 0.01905321702361107, "learning_rate": 2.1925133689839575e-06, "loss": 0.0008, "reward": 0.28443750739097595, "reward_std": 0.42955344915390015, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11256249994039536, "step": 246 }, { "completion_length": 168.71875, "epoch": 0.13215623327982878, "grad_norm": 1.5308873653411865, "kl": 0.043150462210178375, "learning_rate": 2.201426024955437e-06, "loss": 0.0017, "reward": 0.18700000643730164, "reward_std": 0.2159353792667389, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10887500643730164, "step": 247 }, { "completion_length": 151.21875, "epoch": 0.1326912787586945, "grad_norm": 1.3490028381347656, "kl": 0.022311992943286896, "learning_rate": 2.2103386809269166e-06, "loss": 0.0009, "reward": 0.610406219959259, "reward_std": 0.6173575520515442, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17290624976158142, "step": 248 }, { "completion_length": 183.6875, "epoch": 0.1332263242375602, "grad_norm": 1.6245038509368896, "kl": 0.022144947201013565, "learning_rate": 2.219251336898396e-06, "loss": 0.0009, "reward": 0.15546876192092896, "reward_std": 0.18019449710845947, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10859375447034836, "step": 249 }, { "completion_length": 182.78125, "epoch": 0.1337613697164259, "grad_norm": 4.402322769165039, "kl": 0.019585631787776947, "learning_rate": 2.2281639928698752e-06, "loss": 0.0008, "reward": 0.26893749833106995, "reward_std": 0.40810951590538025, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06581249833106995, "step": 250 }, { "completion_length": 172.96875, "epoch": 0.1342964151952916, "grad_norm": 0.7279666662216187, "kl": 0.049347613006830215, "learning_rate": 2.2370766488413546e-06, "loss": 0.002, "reward": 0.2277500033378601, "reward_std": 0.2958270311355591, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1340000033378601, "step": 251 }, { "completion_length": 177.59375, "epoch": 0.1348314606741573, "grad_norm": 2.534862756729126, "kl": 0.016207261011004448, "learning_rate": 2.2459893048128343e-06, "loss": 0.0006, "reward": 0.19943751394748688, "reward_std": 0.1633104532957077, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13693749904632568, "step": 252 }, { "completion_length": 151.3125, "epoch": 0.135366506153023, "grad_norm": 0.877824604511261, "kl": 0.03462403267621994, "learning_rate": 2.254901960784314e-06, "loss": 0.0014, "reward": 0.5677499771118164, "reward_std": 0.829249382019043, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458750069141388, "step": 253 }, { "completion_length": 163.625, "epoch": 0.13590155163188872, "grad_norm": 4.797736167907715, "kl": 0.02084248885512352, "learning_rate": 2.2638146167557934e-06, "loss": 0.0008, "reward": 0.47487497329711914, "reward_std": 0.6357218623161316, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13112498819828033, "step": 254 }, { "completion_length": 155.09375, "epoch": 0.13643659711075443, "grad_norm": 0.6609204411506653, "kl": 0.015083454549312592, "learning_rate": 2.2727272727272728e-06, "loss": 0.0006, "reward": 0.8069062232971191, "reward_std": 0.853371262550354, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15065625309944153, "step": 255 }, { "completion_length": 166.65625, "epoch": 0.1369716425896201, "grad_norm": 1.3789000511169434, "kl": 0.026414338499307632, "learning_rate": 2.281639928698752e-06, "loss": 0.0011, "reward": 0.21453124284744263, "reward_std": 0.3256036043167114, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10515625029802322, "step": 256 }, { "completion_length": 156.71875, "epoch": 0.13750668806848582, "grad_norm": 2.9521067142486572, "kl": 0.031738974153995514, "learning_rate": 2.290552584670232e-06, "loss": 0.0013, "reward": 0.4349687695503235, "reward_std": 0.5385340452194214, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1693437546491623, "step": 257 }, { "completion_length": 174.53125, "epoch": 0.13804173354735153, "grad_norm": 6863563264.0, "kl": 20662610.0, "learning_rate": 2.2994652406417116e-06, "loss": 826504.5, "reward": 0.42109373211860657, "reward_std": 0.5653202533721924, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12421874701976776, "step": 258 }, { "completion_length": 187.9375, "epoch": 0.13857677902621723, "grad_norm": 2.4365122318267822, "kl": 0.017830926924943924, "learning_rate": 2.308377896613191e-06, "loss": 0.0007, "reward": 0.14359374344348907, "reward_std": 0.11869947612285614, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11234374344348907, "step": 259 }, { "completion_length": 166.53125, "epoch": 0.13911182450508294, "grad_norm": 1.2235019207000732, "kl": 0.02235807105898857, "learning_rate": 2.3172905525846703e-06, "loss": 0.0009, "reward": 0.18712501227855682, "reward_std": 0.1488221287727356, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.15587499737739563, "step": 260 }, { "completion_length": 172.96875, "epoch": 0.13964686998394862, "grad_norm": 2.1417160034179688, "kl": 0.027957623824477196, "learning_rate": 2.32620320855615e-06, "loss": 0.0011, "reward": 0.33250001072883606, "reward_std": 0.3669039309024811, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12937499582767487, "step": 261 }, { "completion_length": 156.15625, "epoch": 0.14018191546281433, "grad_norm": 1.6583397388458252, "kl": 0.03683379665017128, "learning_rate": 2.3351158645276294e-06, "loss": 0.0015, "reward": 0.25600001215934753, "reward_std": 0.33171916007995605, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14662499725818634, "step": 262 }, { "completion_length": 174.5625, "epoch": 0.14071696094168004, "grad_norm": 0.6746355295181274, "kl": 0.021797701716423035, "learning_rate": 2.344028520499109e-06, "loss": 0.0009, "reward": 0.24971875548362732, "reward_std": 0.3027639389038086, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10909375548362732, "step": 263 }, { "completion_length": 157.9375, "epoch": 0.14125200642054575, "grad_norm": 1.4242279529571533, "kl": 0.0612359493970871, "learning_rate": 2.3529411764705885e-06, "loss": 0.0024, "reward": 0.16246874630451202, "reward_std": 0.06427103281021118, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16246876120567322, "step": 264 }, { "completion_length": 181.03125, "epoch": 0.14178705189941146, "grad_norm": 2.6077311038970947, "kl": 0.055326029658317566, "learning_rate": 2.3618538324420678e-06, "loss": 0.0022, "reward": 0.3373124897480011, "reward_std": 0.5901380777359009, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0716874971985817, "step": 265 }, { "completion_length": 175.40625, "epoch": 0.14232209737827714, "grad_norm": 2.659674644470215, "kl": 0.044008441269397736, "learning_rate": 2.3707664884135475e-06, "loss": 0.0018, "reward": 0.28862500190734863, "reward_std": 0.3666527271270752, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11675000190734863, "step": 266 }, { "completion_length": 148.375, "epoch": 0.14285714285714285, "grad_norm": 1.5463439226150513, "kl": 0.029985371977090836, "learning_rate": 2.379679144385027e-06, "loss": 0.0012, "reward": 0.6830624938011169, "reward_std": 0.5734925866127014, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16743749380111694, "step": 267 }, { "completion_length": 170.78125, "epoch": 0.14339218833600856, "grad_norm": 1331.209228515625, "kl": 9.698622703552246, "learning_rate": 2.3885918003565066e-06, "loss": 0.3879, "reward": 0.4349374771118164, "reward_std": 0.40762215852737427, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1224374920129776, "step": 268 }, { "completion_length": 157.34375, "epoch": 0.14392723381487427, "grad_norm": 3.7497780323028564, "kl": 0.08676715940237045, "learning_rate": 2.397504456327986e-06, "loss": 0.0035, "reward": 0.1510937511920929, "reward_std": 0.11424301564693451, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1354687511920929, "step": 269 }, { "completion_length": 165.0625, "epoch": 0.14446227929373998, "grad_norm": 1.7408291101455688, "kl": 0.0413704589009285, "learning_rate": 2.4064171122994653e-06, "loss": 0.0017, "reward": 0.39759373664855957, "reward_std": 0.5336105823516846, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.14759375154972076, "step": 270 }, { "completion_length": 165.375, "epoch": 0.14499732477260568, "grad_norm": 0.6810373663902283, "kl": 0.01655806042253971, "learning_rate": 2.415329768270945e-06, "loss": 0.0007, "reward": 0.25809377431869507, "reward_std": 0.38406676054000854, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14871874451637268, "step": 271 }, { "completion_length": 170.59375, "epoch": 0.14553237025147137, "grad_norm": 217133.046875, "kl": 28240.958984375, "learning_rate": 2.4242424242424244e-06, "loss": 1129.6384, "reward": 0.19090625643730164, "reward_std": 0.17806828022003174, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12840625643730164, "step": 272 }, { "completion_length": 163.03125, "epoch": 0.14606741573033707, "grad_norm": 1.2362523078918457, "kl": 0.012762494385242462, "learning_rate": 2.433155080213904e-06, "loss": 0.0005, "reward": 0.3517812490463257, "reward_std": 0.3707553446292877, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14865624904632568, "step": 273 }, { "completion_length": 157.03125, "epoch": 0.14660246120920278, "grad_norm": 3.5688905715942383, "kl": 0.04107591509819031, "learning_rate": 2.4420677361853835e-06, "loss": 0.0016, "reward": 0.222781240940094, "reward_std": 0.3117217719554901, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.144656240940094, "step": 274 }, { "completion_length": 165.125, "epoch": 0.1471375066880685, "grad_norm": 0.8056825399398804, "kl": 0.028582176193594933, "learning_rate": 2.450980392156863e-06, "loss": 0.0011, "reward": 0.414187490940094, "reward_std": 0.7129790782928467, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.132937490940094, "step": 275 }, { "completion_length": 162.34375, "epoch": 0.1476725521669342, "grad_norm": 1.9868731498718262, "kl": 0.03849145025014877, "learning_rate": 2.4598930481283426e-06, "loss": 0.0015, "reward": 0.4229375123977661, "reward_std": 0.3949018716812134, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15731249749660492, "step": 276 }, { "completion_length": 166.21875, "epoch": 0.14820759764579988, "grad_norm": 1.0291547775268555, "kl": 0.022729871794581413, "learning_rate": 2.468805704099822e-06, "loss": 0.0009, "reward": 0.4741874933242798, "reward_std": 0.5843730568885803, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13043749332427979, "step": 277 }, { "completion_length": 171.34375, "epoch": 0.1487426431246656, "grad_norm": 1.1588283777236938, "kl": 0.030043980106711388, "learning_rate": 2.4777183600713017e-06, "loss": 0.0012, "reward": 0.18196876347064972, "reward_std": 0.14943572878837585, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13509374856948853, "step": 278 }, { "completion_length": 171.84375, "epoch": 0.1492776886035313, "grad_norm": 1.171825885772705, "kl": 0.019486159086227417, "learning_rate": 2.486631016042781e-06, "loss": 0.0008, "reward": 0.12006250023841858, "reward_std": 0.11729497462511063, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10443750023841858, "step": 279 }, { "completion_length": 161.53125, "epoch": 0.149812734082397, "grad_norm": 1.6589341163635254, "kl": 0.060612570494413376, "learning_rate": 2.4955436720142603e-06, "loss": 0.0024, "reward": 0.2253125011920929, "reward_std": 0.3611285090446472, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1159375011920929, "step": 280 }, { "completion_length": 168.875, "epoch": 0.15034777956126272, "grad_norm": 0.9924216866493225, "kl": 0.030281612649559975, "learning_rate": 2.5044563279857397e-06, "loss": 0.0012, "reward": 0.6079687476158142, "reward_std": 0.7239983677864075, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12359374761581421, "step": 281 }, { "completion_length": 178.625, "epoch": 0.1508828250401284, "grad_norm": 1.5958192348480225, "kl": 0.05434023588895798, "learning_rate": 2.5133689839572194e-06, "loss": 0.0022, "reward": 0.3649062514305115, "reward_std": 0.4032500982284546, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09928125143051147, "step": 282 }, { "completion_length": 184.9375, "epoch": 0.1514178705189941, "grad_norm": 545.2910766601562, "kl": 0.445149302482605, "learning_rate": 2.522281639928699e-06, "loss": 0.0178, "reward": 0.117125004529953, "reward_std": 0.07017260044813156, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1014999970793724, "step": 283 }, { "completion_length": 183.6875, "epoch": 0.15195291599785982, "grad_norm": 4.196126461029053, "kl": 0.026303794234991074, "learning_rate": 2.5311942959001785e-06, "loss": 0.0011, "reward": 0.23196876049041748, "reward_std": 0.31441229581832886, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10696875303983688, "step": 284 }, { "completion_length": 152.71875, "epoch": 0.15248796147672553, "grad_norm": 1.647562861442566, "kl": 0.058844536542892456, "learning_rate": 2.5401069518716583e-06, "loss": 0.0024, "reward": 0.3333437442779541, "reward_std": 0.41381070017814636, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458437591791153, "step": 285 }, { "completion_length": 170.03125, "epoch": 0.15302300695559123, "grad_norm": 0.944121778011322, "kl": 0.05548493564128876, "learning_rate": 2.549019607843137e-06, "loss": 0.0022, "reward": 0.2535000145435333, "reward_std": 0.22786268591880798, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.14412499964237213, "step": 286 }, { "completion_length": 172.5625, "epoch": 0.15355805243445692, "grad_norm": 8.643299102783203, "kl": 0.03655913844704628, "learning_rate": 2.557932263814617e-06, "loss": 0.0015, "reward": 0.1550624966621399, "reward_std": 0.15778206288814545, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12381249666213989, "step": 287 }, { "completion_length": 168.84375, "epoch": 0.15409309791332262, "grad_norm": 0.7875510454177856, "kl": 0.019801558926701546, "learning_rate": 2.5668449197860967e-06, "loss": 0.0008, "reward": 0.2901874780654907, "reward_std": 0.33798348903656006, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1495625078678131, "step": 288 }, { "completion_length": 168.625, "epoch": 0.15462814339218833, "grad_norm": 0.7711018919944763, "kl": 0.029962655156850815, "learning_rate": 2.575757575757576e-06, "loss": 0.0012, "reward": 0.43809375166893005, "reward_std": 0.4427492618560791, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15684375166893005, "step": 289 }, { "completion_length": 178.28125, "epoch": 0.15516318887105404, "grad_norm": 1872369.5, "kl": 1826.2122802734375, "learning_rate": 2.5846702317290558e-06, "loss": 73.0485, "reward": 0.2369062602519989, "reward_std": 0.27444273233413696, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11190624535083771, "step": 290 }, { "completion_length": 160.53125, "epoch": 0.15569823434991975, "grad_norm": 12.07524299621582, "kl": 0.26067495346069336, "learning_rate": 2.5935828877005347e-06, "loss": 0.0104, "reward": 0.4492499828338623, "reward_std": 0.6106736063957214, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1679999977350235, "step": 291 }, { "completion_length": 175.15625, "epoch": 0.15623327982878546, "grad_norm": 2.875174045562744, "kl": 0.08309973776340485, "learning_rate": 2.6024955436720144e-06, "loss": 0.0033, "reward": 0.5605937838554382, "reward_std": 0.6128597259521484, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12309375405311584, "step": 292 }, { "completion_length": 176.1875, "epoch": 0.15676832530765114, "grad_norm": 1.4359440803527832, "kl": 0.016935978084802628, "learning_rate": 2.611408199643494e-06, "loss": 0.0007, "reward": 0.09950000047683716, "reward_std": 0.11718019843101501, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08387499302625656, "step": 293 }, { "completion_length": 162.21875, "epoch": 0.15730337078651685, "grad_norm": 0.6774555444717407, "kl": 0.02512301504611969, "learning_rate": 2.6203208556149735e-06, "loss": 0.001, "reward": 0.39371874928474426, "reward_std": 0.46549344062805176, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14371874928474426, "step": 294 }, { "completion_length": 166.375, "epoch": 0.15783841626538256, "grad_norm": 3.680288314819336, "kl": 0.04839645326137543, "learning_rate": 2.6292335115864533e-06, "loss": 0.0019, "reward": 0.416812539100647, "reward_std": 0.39651110768318176, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13556250929832458, "step": 295 }, { "completion_length": 158.625, "epoch": 0.15837346174424827, "grad_norm": 0.6054403185844421, "kl": 0.018837904557585716, "learning_rate": 2.638146167557932e-06, "loss": 0.0008, "reward": 0.2083750069141388, "reward_std": 0.1730749011039734, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1615000069141388, "step": 296 }, { "completion_length": 161.71875, "epoch": 0.15890850722311398, "grad_norm": 1.6553435325622559, "kl": 0.04707062244415283, "learning_rate": 2.647058823529412e-06, "loss": 0.0019, "reward": 0.14478124678134918, "reward_std": 0.13686253130435944, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12915624678134918, "step": 297 }, { "completion_length": 148.03125, "epoch": 0.15944355270197966, "grad_norm": 2.7282602787017822, "kl": 0.04315043240785599, "learning_rate": 2.6559714795008917e-06, "loss": 0.0017, "reward": 0.656499981880188, "reward_std": 0.5126872062683105, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15650001168251038, "step": 298 }, { "completion_length": 176.875, "epoch": 0.15997859818084537, "grad_norm": 315075.21875, "kl": 398.23797607421875, "learning_rate": 2.664884135472371e-06, "loss": 15.9295, "reward": 0.40321871638298035, "reward_std": 0.38993558287620544, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13759374618530273, "step": 299 }, { "completion_length": 182.46875, "epoch": 0.16051364365971107, "grad_norm": 1.1703861951828003, "kl": 0.033450640738010406, "learning_rate": 2.673796791443851e-06, "loss": 0.0013, "reward": 0.1315937489271164, "reward_std": 0.07619395107030869, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1159687489271164, "step": 300 }, { "completion_length": 157.625, "epoch": 0.16104868913857678, "grad_norm": 1.8125144243240356, "kl": 0.09640952944755554, "learning_rate": 2.6827094474153297e-06, "loss": 0.0039, "reward": 0.6019062399864197, "reward_std": 0.6797053813934326, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14878125488758087, "step": 301 }, { "completion_length": 176.1875, "epoch": 0.1615837346174425, "grad_norm": 1.317686915397644, "kl": 0.03859119117259979, "learning_rate": 2.6916221033868095e-06, "loss": 0.0015, "reward": 0.5389062166213989, "reward_std": 0.4231955409049988, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1326562464237213, "step": 302 }, { "completion_length": 176.34375, "epoch": 0.16211878009630817, "grad_norm": 1.063449501991272, "kl": 0.04150380566716194, "learning_rate": 2.7005347593582892e-06, "loss": 0.0017, "reward": 0.3420937657356262, "reward_std": 0.4237964451313019, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13896875083446503, "step": 303 }, { "completion_length": 172.25, "epoch": 0.16265382557517388, "grad_norm": 148.07907104492188, "kl": 0.4606086015701294, "learning_rate": 2.7094474153297686e-06, "loss": 0.0184, "reward": 0.25465625524520874, "reward_std": 0.2070089876651764, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11403125524520874, "step": 304 }, { "completion_length": 149.875, "epoch": 0.1631888710540396, "grad_norm": 1.8406124114990234, "kl": 0.05775555968284607, "learning_rate": 2.7183600713012483e-06, "loss": 0.0023, "reward": 0.18071874976158142, "reward_std": 0.12641805410385132, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16509374976158142, "step": 305 }, { "completion_length": 160.90625, "epoch": 0.1637239165329053, "grad_norm": 0.8541836738586426, "kl": 0.0466364249587059, "learning_rate": 2.7272727272727272e-06, "loss": 0.0019, "reward": 0.5077500343322754, "reward_std": 0.3775889575481415, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1639999896287918, "step": 306 }, { "completion_length": 171.78125, "epoch": 0.164258962011771, "grad_norm": 0.7897668480873108, "kl": 0.031769637018442154, "learning_rate": 2.736185383244207e-06, "loss": 0.0013, "reward": 0.7522187829017639, "reward_std": 0.7085415124893188, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14284375309944153, "step": 307 }, { "completion_length": 191.75, "epoch": 0.1647940074906367, "grad_norm": 79164.65625, "kl": 82.3728256225586, "learning_rate": 2.7450980392156867e-06, "loss": 3.2949, "reward": 0.23140625655651093, "reward_std": 0.24558579921722412, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10640625655651093, "step": 308 }, { "completion_length": 167.15625, "epoch": 0.1653290529695024, "grad_norm": 2.9101619720458984, "kl": 0.03629949316382408, "learning_rate": 2.754010695187166e-06, "loss": 0.0015, "reward": 0.3296562433242798, "reward_std": 0.4048430025577545, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12653125822544098, "step": 309 }, { "completion_length": 179.5, "epoch": 0.1658640984483681, "grad_norm": 2.1971540451049805, "kl": 0.038501616567373276, "learning_rate": 2.762923351158646e-06, "loss": 0.0015, "reward": 0.2917500138282776, "reward_std": 0.44202548265457153, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1198749989271164, "step": 310 }, { "completion_length": 158.4375, "epoch": 0.16639914392723382, "grad_norm": 2.382782459259033, "kl": 0.04860544577240944, "learning_rate": 2.7718360071301247e-06, "loss": 0.0019, "reward": 0.20521876215934753, "reward_std": 0.2618916630744934, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12709374725818634, "step": 311 }, { "completion_length": 125.9375, "epoch": 0.16693418940609953, "grad_norm": 1.1022450923919678, "kl": 0.05502060428261757, "learning_rate": 2.7807486631016045e-06, "loss": 0.0022, "reward": 0.6846874952316284, "reward_std": 0.6560103893280029, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21593749523162842, "step": 312 }, { "completion_length": 158.0, "epoch": 0.16746923488496523, "grad_norm": 0.9111614227294922, "kl": 0.052071698009967804, "learning_rate": 2.7896613190730843e-06, "loss": 0.0021, "reward": 0.6464375257492065, "reward_std": 1.0500140190124512, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17768749594688416, "step": 313 }, { "completion_length": 123.5, "epoch": 0.16800428036383092, "grad_norm": 1.3847737312316895, "kl": 0.09401572495698929, "learning_rate": 2.7985739750445636e-06, "loss": 0.0038, "reward": 0.6441875100135803, "reward_std": 0.7948089838027954, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22231249511241913, "step": 314 }, { "completion_length": 163.8125, "epoch": 0.16853932584269662, "grad_norm": 1.3124598264694214, "kl": 0.04251493141055107, "learning_rate": 2.807486631016043e-06, "loss": 0.0017, "reward": 0.3604687452316284, "reward_std": 0.4033702611923218, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14171874523162842, "step": 315 }, { "completion_length": 173.625, "epoch": 0.16907437132156233, "grad_norm": 0.9373641610145569, "kl": 0.04443620145320892, "learning_rate": 2.8163992869875223e-06, "loss": 0.0018, "reward": 0.4399687647819519, "reward_std": 0.36363962292671204, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1587187498807907, "step": 316 }, { "completion_length": 152.28125, "epoch": 0.16960941680042804, "grad_norm": 14.069267272949219, "kl": 0.39573031663894653, "learning_rate": 2.825311942959002e-06, "loss": 0.0158, "reward": 0.40856248140335083, "reward_std": 0.48961788415908813, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17418751120567322, "step": 317 }, { "completion_length": 162.15625, "epoch": 0.17014446227929375, "grad_norm": 23.355316162109375, "kl": 0.15103361010551453, "learning_rate": 2.8342245989304818e-06, "loss": 0.006, "reward": 0.6880624890327454, "reward_std": 0.7022018432617188, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10993750393390656, "step": 318 }, { "completion_length": 186.0, "epoch": 0.17067950775815943, "grad_norm": 0.6632013916969299, "kl": 0.031246036291122437, "learning_rate": 2.843137254901961e-06, "loss": 0.0012, "reward": 0.14884375035762787, "reward_std": 0.11221294105052948, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11759375035762787, "step": 319 }, { "completion_length": 174.125, "epoch": 0.17121455323702514, "grad_norm": 24.013641357421875, "kl": 0.06196340173482895, "learning_rate": 2.8520499108734404e-06, "loss": 0.0025, "reward": 0.25312501192092896, "reward_std": 0.2830520570278168, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11249999701976776, "step": 320 }, { "completion_length": 163.46875, "epoch": 0.17174959871589085, "grad_norm": 1.6039682626724243, "kl": 0.06345614045858383, "learning_rate": 2.8609625668449198e-06, "loss": 0.0025, "reward": 0.17784374952316284, "reward_std": 0.12474849820137024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14659374952316284, "step": 321 }, { "completion_length": 176.875, "epoch": 0.17228464419475656, "grad_norm": 0.7074977159500122, "kl": 0.04023730009794235, "learning_rate": 2.8698752228163995e-06, "loss": 0.0016, "reward": 0.2057812511920929, "reward_std": 0.16390138864517212, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1120312437415123, "step": 322 }, { "completion_length": 156.46875, "epoch": 0.17281968967362227, "grad_norm": 2.651322364807129, "kl": 0.056397490203380585, "learning_rate": 2.8787878787878793e-06, "loss": 0.0023, "reward": 0.6474375128746033, "reward_std": 0.6546005010604858, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16306249797344208, "step": 323 }, { "completion_length": 171.4375, "epoch": 0.17335473515248795, "grad_norm": 0.9726559519767761, "kl": 0.0443711057305336, "learning_rate": 2.8877005347593586e-06, "loss": 0.0018, "reward": 0.5163124799728394, "reward_std": 0.6498225927352905, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12568749487400055, "step": 324 }, { "completion_length": 158.96875, "epoch": 0.17388978063135366, "grad_norm": 1.791941523551941, "kl": 0.043454863131046295, "learning_rate": 2.896613190730838e-06, "loss": 0.0017, "reward": 0.7900000214576721, "reward_std": 0.7572901248931885, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14937500655651093, "step": 325 }, { "completion_length": 169.21875, "epoch": 0.17442482611021937, "grad_norm": 0.7924731969833374, "kl": 0.030979512259364128, "learning_rate": 2.9055258467023173e-06, "loss": 0.0012, "reward": 0.30559372901916504, "reward_std": 0.3740181028842926, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.16496875882148743, "step": 326 }, { "completion_length": 143.71875, "epoch": 0.17495987158908508, "grad_norm": 1.0253592729568481, "kl": 0.06509197503328323, "learning_rate": 2.914438502673797e-06, "loss": 0.0026, "reward": 0.6199687719345093, "reward_std": 0.35781434178352356, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.18246875703334808, "step": 327 }, { "completion_length": 167.03125, "epoch": 0.17549491706795078, "grad_norm": 1.13686203956604, "kl": 0.04536311700940132, "learning_rate": 2.923351158645277e-06, "loss": 0.0018, "reward": 0.1446875035762787, "reward_std": 0.1188635379076004, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1290625035762787, "step": 328 }, { "completion_length": 147.71875, "epoch": 0.1760299625468165, "grad_norm": 1.484069585800171, "kl": 0.0994771420955658, "learning_rate": 2.9322638146167557e-06, "loss": 0.004, "reward": 0.19049999117851257, "reward_std": 0.10507624596357346, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.03125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15924999117851257, "step": 329 }, { "completion_length": 172.28125, "epoch": 0.17656500802568217, "grad_norm": 441253.53125, "kl": 826.52392578125, "learning_rate": 2.9411764705882355e-06, "loss": 33.061, "reward": 0.19634374976158142, "reward_std": 0.2058209925889969, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11821874976158142, "step": 330 }, { "completion_length": 167.1875, "epoch": 0.17710005350454788, "grad_norm": 1.6360682249069214, "kl": 0.038716770708560944, "learning_rate": 2.950089126559715e-06, "loss": 0.0015, "reward": 0.3694687485694885, "reward_std": 0.5354635715484619, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11946874856948853, "step": 331 }, { "completion_length": 164.21875, "epoch": 0.1776350989834136, "grad_norm": 20.35577964782715, "kl": 0.5848493576049805, "learning_rate": 2.9590017825311946e-06, "loss": 0.0234, "reward": 0.39262500405311584, "reward_std": 0.572323739528656, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14262500405311584, "step": 332 }, { "completion_length": 158.21875, "epoch": 0.1781701444622793, "grad_norm": 14899199.0, "kl": 22027.619140625, "learning_rate": 2.9679144385026743e-06, "loss": 881.1047, "reward": 0.4635937511920929, "reward_std": 0.4074245095252991, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1667187511920929, "step": 333 }, { "completion_length": 162.90625, "epoch": 0.178705189941145, "grad_norm": 1.7296932935714722, "kl": 0.03836742788553238, "learning_rate": 2.9768270944741532e-06, "loss": 0.0015, "reward": 0.5199375152587891, "reward_std": 0.5745936632156372, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12931251525878906, "step": 334 }, { "completion_length": 177.03125, "epoch": 0.1792402354200107, "grad_norm": 26201.041015625, "kl": 66.0668716430664, "learning_rate": 2.985739750445633e-06, "loss": 2.6427, "reward": 0.3308437466621399, "reward_std": 0.35484713315963745, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11209375411272049, "step": 335 }, { "completion_length": 154.15625, "epoch": 0.1797752808988764, "grad_norm": 2.6687066555023193, "kl": 0.057273607701063156, "learning_rate": 2.9946524064171123e-06, "loss": 0.0023, "reward": 0.7671562433242798, "reward_std": 0.7282063961029053, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17340625822544098, "step": 336 }, { "completion_length": 173.21875, "epoch": 0.1803103263777421, "grad_norm": 1.237471580505371, "kl": 0.04546291381120682, "learning_rate": 3.003565062388592e-06, "loss": 0.0018, "reward": 0.40690624713897705, "reward_std": 0.5324851274490356, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11003124713897705, "step": 337 }, { "completion_length": 134.96875, "epoch": 0.18084537185660782, "grad_norm": 1.17357337474823, "kl": 0.03836257383227348, "learning_rate": 3.012477718360072e-06, "loss": 0.0015, "reward": 0.9293125867843628, "reward_std": 0.6919522285461426, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19493751227855682, "step": 338 }, { "completion_length": 142.25, "epoch": 0.18138041733547353, "grad_norm": 1.739876627922058, "kl": 0.12491313368082047, "learning_rate": 3.0213903743315507e-06, "loss": 0.005, "reward": 0.5740624666213989, "reward_std": 0.7064695358276367, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1990625113248825, "step": 339 }, { "completion_length": 163.375, "epoch": 0.1819154628143392, "grad_norm": 0.5370839238166809, "kl": 0.06175350770354271, "learning_rate": 3.0303030303030305e-06, "loss": 0.0025, "reward": 0.7913438081741333, "reward_std": 0.4564620852470398, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16634374856948853, "step": 340 }, { "completion_length": 155.21875, "epoch": 0.18245050829320492, "grad_norm": 1.7540959119796753, "kl": 0.03988974541425705, "learning_rate": 3.03921568627451e-06, "loss": 0.0016, "reward": 0.5687187314033508, "reward_std": 0.4641784727573395, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.16246874630451202, "step": 341 }, { "completion_length": 143.03125, "epoch": 0.18298555377207062, "grad_norm": 2.7355141639709473, "kl": 0.09511883556842804, "learning_rate": 3.0481283422459896e-06, "loss": 0.0038, "reward": 0.5922187566757202, "reward_std": 0.5445666313171387, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18596875667572021, "step": 342 }, { "completion_length": 132.03125, "epoch": 0.18352059925093633, "grad_norm": 9.531299591064453, "kl": 0.14579549431800842, "learning_rate": 3.0570409982174693e-06, "loss": 0.0058, "reward": 1.6479063034057617, "reward_std": 1.049806833267212, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22603124380111694, "step": 343 }, { "completion_length": 150.65625, "epoch": 0.18405564472980204, "grad_norm": 4.007087230682373, "kl": 0.09768294543027878, "learning_rate": 3.0659536541889482e-06, "loss": 0.0039, "reward": 0.8305000066757202, "reward_std": 0.6810753345489502, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17424999177455902, "step": 344 }, { "completion_length": 176.875, "epoch": 0.18459069020866772, "grad_norm": 0.9832783341407776, "kl": 0.05737035721540451, "learning_rate": 3.074866310160428e-06, "loss": 0.0023, "reward": 0.5290000438690186, "reward_std": 0.5666155815124512, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13837499916553497, "step": 345 }, { "completion_length": 165.46875, "epoch": 0.18512573568753343, "grad_norm": 1.4155596494674683, "kl": 0.06498372554779053, "learning_rate": 3.0837789661319073e-06, "loss": 0.0026, "reward": 0.47971874475479126, "reward_std": 0.6761528849601746, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13596874475479126, "step": 346 }, { "completion_length": 153.34375, "epoch": 0.18566078116639914, "grad_norm": 1.0147651433944702, "kl": 0.053523678332567215, "learning_rate": 3.092691622103387e-06, "loss": 0.0021, "reward": 0.6935937404632568, "reward_std": 0.6526201963424683, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17796874046325684, "step": 347 }, { "completion_length": 157.21875, "epoch": 0.18619582664526485, "grad_norm": 1.0549206733703613, "kl": 0.03967276215553284, "learning_rate": 3.101604278074867e-06, "loss": 0.0016, "reward": 0.429562509059906, "reward_std": 0.7266602516174316, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.163937509059906, "step": 348 }, { "completion_length": 141.03125, "epoch": 0.18673087212413056, "grad_norm": 2.214430809020996, "kl": 0.03967590630054474, "learning_rate": 3.1105169340463458e-06, "loss": 0.0016, "reward": 0.4179999828338623, "reward_std": 0.5312068462371826, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1836249977350235, "step": 349 }, { "completion_length": 151.15625, "epoch": 0.18726591760299627, "grad_norm": 1.682969093322754, "kl": 0.05505958944559097, "learning_rate": 3.1194295900178255e-06, "loss": 0.0022, "reward": 0.5015312433242798, "reward_std": 0.43204864859580994, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17340624332427979, "step": 350 }, { "completion_length": 150.34375, "epoch": 0.18780096308186195, "grad_norm": 1.0874571800231934, "kl": 0.08755031228065491, "learning_rate": 3.128342245989305e-06, "loss": 0.0035, "reward": 0.3604687452316284, "reward_std": 0.5365550518035889, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15734374523162842, "step": 351 }, { "completion_length": 161.6875, "epoch": 0.18833600856072766, "grad_norm": 1.0363740921020508, "kl": 0.038686759769916534, "learning_rate": 3.1372549019607846e-06, "loss": 0.0015, "reward": 0.417843759059906, "reward_std": 0.39829227328300476, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.167843759059906, "step": 352 }, { "completion_length": 151.75, "epoch": 0.18887105403959337, "grad_norm": 1.881595492362976, "kl": 0.0684247761964798, "learning_rate": 3.1461675579322644e-06, "loss": 0.0027, "reward": 0.29475003480911255, "reward_std": 0.2644118666648865, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16975000500679016, "step": 353 }, { "completion_length": 162.21875, "epoch": 0.18940609951845908, "grad_norm": 1.8000438213348389, "kl": 0.059871163219213486, "learning_rate": 3.1550802139037433e-06, "loss": 0.0024, "reward": 0.8570000529289246, "reward_std": 0.5919697284698486, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12262500822544098, "step": 354 }, { "completion_length": 164.9375, "epoch": 0.18994114499732478, "grad_norm": 1.1148204803466797, "kl": 0.04210718721151352, "learning_rate": 3.163992869875223e-06, "loss": 0.0017, "reward": 0.22084374725818634, "reward_std": 0.16530151665210724, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15834376215934753, "step": 355 }, { "completion_length": 172.8125, "epoch": 0.19047619047619047, "grad_norm": 2.7306156158447266, "kl": 0.07564754784107208, "learning_rate": 3.1729055258467024e-06, "loss": 0.003, "reward": 0.41737499833106995, "reward_std": 0.5740177631378174, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13612499833106995, "step": 356 }, { "completion_length": 149.84375, "epoch": 0.19101123595505617, "grad_norm": 6039.61865234375, "kl": 22.919519424438477, "learning_rate": 3.181818181818182e-06, "loss": 0.9168, "reward": 0.9386249780654907, "reward_std": 0.9162291288375854, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18862499296665192, "step": 357 }, { "completion_length": 163.09375, "epoch": 0.19154628143392188, "grad_norm": 89070128.0, "kl": 1592087.25, "learning_rate": 3.190730837789662e-06, "loss": 63683.4922, "reward": 0.7713437080383301, "reward_std": 0.5630968809127808, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16196873784065247, "step": 358 }, { "completion_length": 127.625, "epoch": 0.1920813269127876, "grad_norm": 1.3437691926956177, "kl": 0.08369478583335876, "learning_rate": 3.1996434937611408e-06, "loss": 0.0033, "reward": 1.383906364440918, "reward_std": 1.0631163120269775, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21203124523162842, "step": 359 }, { "completion_length": 149.25, "epoch": 0.1926163723916533, "grad_norm": 2.276386260986328, "kl": 0.11770177632570267, "learning_rate": 3.2085561497326205e-06, "loss": 0.0047, "reward": 0.31856250762939453, "reward_std": 0.3618817925453186, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17793750762939453, "step": 360 }, { "completion_length": 152.59375, "epoch": 0.19315141787051898, "grad_norm": 3.7964773178100586, "kl": 0.06175083667039871, "learning_rate": 3.2174688057041003e-06, "loss": 0.0025, "reward": 0.7490625381469727, "reward_std": 0.6147525310516357, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15531250834465027, "step": 361 }, { "completion_length": 161.875, "epoch": 0.1936864633493847, "grad_norm": 2.097108840942383, "kl": 0.06278850138187408, "learning_rate": 3.2263814616755796e-06, "loss": 0.0025, "reward": 0.2835312485694885, "reward_std": 0.3288436532020569, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14290624856948853, "step": 362 }, { "completion_length": 177.84375, "epoch": 0.1942215088282504, "grad_norm": 3.4530811309814453, "kl": 0.06442040950059891, "learning_rate": 3.2352941176470594e-06, "loss": 0.0026, "reward": 0.39912497997283936, "reward_std": 0.586044192314148, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10224999487400055, "step": 363 }, { "completion_length": 146.5, "epoch": 0.1947565543071161, "grad_norm": 3.16377329826355, "kl": 0.08932147920131683, "learning_rate": 3.2442067736185383e-06, "loss": 0.0036, "reward": 0.722406268119812, "reward_std": 0.7729334831237793, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19115625321865082, "step": 364 }, { "completion_length": 155.15625, "epoch": 0.19529159978598182, "grad_norm": 1.0830423831939697, "kl": 0.06207633763551712, "learning_rate": 3.253119429590018e-06, "loss": 0.0025, "reward": 0.8881250023841858, "reward_std": 0.1682538390159607, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1693749725818634, "step": 365 }, { "completion_length": 135.34375, "epoch": 0.1958266452648475, "grad_norm": 6015.51806640625, "kl": 18.821727752685547, "learning_rate": 3.262032085561498e-06, "loss": 0.7529, "reward": 1.0599374771118164, "reward_std": 0.6198018193244934, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2161874920129776, "step": 366 }, { "completion_length": 157.75, "epoch": 0.1963616907437132, "grad_norm": 1.0033446550369263, "kl": 0.06731581687927246, "learning_rate": 3.270944741532977e-06, "loss": 0.0027, "reward": 0.4950312376022339, "reward_std": 0.6611825227737427, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15128125250339508, "step": 367 }, { "completion_length": 156.5625, "epoch": 0.19689673622257892, "grad_norm": 19950.802734375, "kl": 111.01970672607422, "learning_rate": 3.279857397504457e-06, "loss": 4.4408, "reward": 0.9723437428474426, "reward_std": 0.8879813551902771, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.15984374284744263, "step": 368 }, { "completion_length": 154.09375, "epoch": 0.19743178170144463, "grad_norm": 15.173202514648438, "kl": 0.09775897115468979, "learning_rate": 3.288770053475936e-06, "loss": 0.0039, "reward": 0.433656245470047, "reward_std": 0.40152508020401, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.183656245470047, "step": 369 }, { "completion_length": 163.625, "epoch": 0.19796682718031033, "grad_norm": 0.6688660383224487, "kl": 0.039983317255973816, "learning_rate": 3.2976827094474156e-06, "loss": 0.0016, "reward": 0.2850312292575836, "reward_std": 0.19180038571357727, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1444062441587448, "step": 370 }, { "completion_length": 154.625, "epoch": 0.19850187265917604, "grad_norm": 4467.6845703125, "kl": 20.138042449951172, "learning_rate": 3.3065953654188953e-06, "loss": 0.8055, "reward": 0.43650001287460327, "reward_std": 0.4185536503791809, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15524999797344208, "step": 371 }, { "completion_length": 158.34375, "epoch": 0.19903691813804172, "grad_norm": 1.5125184059143066, "kl": 0.046494193375110626, "learning_rate": 3.3155080213903747e-06, "loss": 0.0019, "reward": 0.30787500739097595, "reward_std": 0.3915994167327881, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18287499248981476, "step": 372 }, { "completion_length": 175.28125, "epoch": 0.19957196361690743, "grad_norm": 1665.6314697265625, "kl": 4.611397743225098, "learning_rate": 3.3244206773618544e-06, "loss": 0.1845, "reward": 0.2082812488079071, "reward_std": 0.14436443150043488, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1301562488079071, "step": 373 }, { "completion_length": 152.03125, "epoch": 0.20010700909577314, "grad_norm": 12631.3193359375, "kl": 144.2276153564453, "learning_rate": 3.3333333333333333e-06, "loss": 5.7691, "reward": 0.4961875379085541, "reward_std": 0.7844967842102051, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.1368125081062317, "step": 374 }, { "completion_length": 158.84375, "epoch": 0.20064205457463885, "grad_norm": 2.1059250831604004, "kl": 0.05961036682128906, "learning_rate": 3.342245989304813e-06, "loss": 0.0024, "reward": 0.4362812638282776, "reward_std": 0.4950805902481079, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1706562489271164, "step": 375 }, { "completion_length": 163.28125, "epoch": 0.20117710005350456, "grad_norm": 1.0990577936172485, "kl": 0.04674486070871353, "learning_rate": 3.351158645276293e-06, "loss": 0.0019, "reward": 0.39771872758865356, "reward_std": 0.4126867353916168, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14771875739097595, "step": 376 }, { "completion_length": 166.8125, "epoch": 0.20171214553237024, "grad_norm": 0.858343243598938, "kl": 0.04805570840835571, "learning_rate": 3.360071301247772e-06, "loss": 0.0019, "reward": 0.34840622544288635, "reward_std": 0.31840887665748596, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12965625524520874, "step": 377 }, { "completion_length": 152.4375, "epoch": 0.20224719101123595, "grad_norm": 27938122.0, "kl": 44383.72265625, "learning_rate": 3.368983957219252e-06, "loss": 1775.3488, "reward": 0.5154687166213989, "reward_std": 0.39702463150024414, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2029687464237213, "step": 378 }, { "completion_length": 154.59375, "epoch": 0.20278223649010166, "grad_norm": 2.10695743560791, "kl": 0.05904068797826767, "learning_rate": 3.377896613190731e-06, "loss": 0.0024, "reward": 0.7843749523162842, "reward_std": 0.7361736297607422, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17499999701976776, "step": 379 }, { "completion_length": 143.1875, "epoch": 0.20331728196896737, "grad_norm": 367.06414794921875, "kl": 3.67225980758667, "learning_rate": 3.3868092691622106e-06, "loss": 0.1469, "reward": 0.7573124766349792, "reward_std": 0.8564784526824951, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17918749153614044, "step": 380 }, { "completion_length": 171.90625, "epoch": 0.20385232744783308, "grad_norm": 0.9975336194038391, "kl": 0.05872911214828491, "learning_rate": 3.3957219251336904e-06, "loss": 0.0023, "reward": 0.7367812395095825, "reward_std": 0.9359557032585144, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1430312544107437, "step": 381 }, { "completion_length": 129.9375, "epoch": 0.20438737292669876, "grad_norm": 0.998724102973938, "kl": 0.09988349676132202, "learning_rate": 3.4046345811051697e-06, "loss": 0.004, "reward": 0.9097188115119934, "reward_std": 0.5699847936630249, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19096875190734863, "step": 382 }, { "completion_length": 157.3125, "epoch": 0.20492241840556447, "grad_norm": 2.935628652572632, "kl": 0.05450469255447388, "learning_rate": 3.4135472370766494e-06, "loss": 0.0022, "reward": 0.8240000009536743, "reward_std": 0.5832922458648682, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18337500095367432, "step": 383 }, { "completion_length": 148.90625, "epoch": 0.20545746388443017, "grad_norm": 1.2152330875396729, "kl": 0.08282585442066193, "learning_rate": 3.4224598930481284e-06, "loss": 0.0033, "reward": 0.7319375276565552, "reward_std": 0.7569752931594849, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1850624978542328, "step": 384 }, { "completion_length": 179.03125, "epoch": 0.20599250936329588, "grad_norm": 0.8825836181640625, "kl": 0.051881566643714905, "learning_rate": 3.431372549019608e-06, "loss": 0.0021, "reward": 0.17734375596046448, "reward_std": 0.10502129048109055, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13046875596046448, "step": 385 }, { "completion_length": 160.75, "epoch": 0.2065275548421616, "grad_norm": 0.7615184783935547, "kl": 0.04345859959721565, "learning_rate": 3.440285204991088e-06, "loss": 0.0017, "reward": 0.6891562938690186, "reward_std": 0.4746515154838562, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14228126406669617, "step": 386 }, { "completion_length": 160.375, "epoch": 0.20706260032102727, "grad_norm": 2.2496612071990967, "kl": 0.12060196697711945, "learning_rate": 3.449197860962567e-06, "loss": 0.0048, "reward": 0.5349375009536743, "reward_std": 0.6299932599067688, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17556250095367432, "step": 387 }, { "completion_length": 149.625, "epoch": 0.20759764579989298, "grad_norm": 0.7596875429153442, "kl": 0.07657285034656525, "learning_rate": 3.458110516934047e-06, "loss": 0.0031, "reward": 0.9290000200271606, "reward_std": 0.6813415288925171, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21024999022483826, "step": 388 }, { "completion_length": 154.03125, "epoch": 0.2081326912787587, "grad_norm": 0.8115154504776001, "kl": 0.05895604193210602, "learning_rate": 3.467023172905526e-06, "loss": 0.0024, "reward": 1.0003750324249268, "reward_std": 0.8954555988311768, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17225000262260437, "step": 389 }, { "completion_length": 143.75, "epoch": 0.2086677367576244, "grad_norm": 1.027033805847168, "kl": 0.07509717345237732, "learning_rate": 3.4759358288770056e-06, "loss": 0.003, "reward": 0.9259687662124634, "reward_std": 0.8307575583457947, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19159376621246338, "step": 390 }, { "completion_length": 145.78125, "epoch": 0.2092027822364901, "grad_norm": 2.190568685531616, "kl": 0.08470791578292847, "learning_rate": 3.4848484848484854e-06, "loss": 0.0034, "reward": 0.6339062452316284, "reward_std": 0.44953715801239014, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1964062601327896, "step": 391 }, { "completion_length": 132.21875, "epoch": 0.20973782771535582, "grad_norm": 26.821184158325195, "kl": 0.37751680612564087, "learning_rate": 3.4937611408199647e-06, "loss": 0.0151, "reward": 1.024999976158142, "reward_std": 0.6861128807067871, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19687500596046448, "step": 392 }, { "completion_length": 149.09375, "epoch": 0.2102728731942215, "grad_norm": 0.9392027258872986, "kl": 0.08183514326810837, "learning_rate": 3.5026737967914445e-06, "loss": 0.0033, "reward": 0.5031562447547913, "reward_std": 0.3347555100917816, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19065625965595245, "step": 393 }, { "completion_length": 148.09375, "epoch": 0.2108079186730872, "grad_norm": 1.167720079421997, "kl": 0.06730467081069946, "learning_rate": 3.5115864527629234e-06, "loss": 0.0027, "reward": 0.6300936937332153, "reward_std": 0.8351419568061829, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1769687533378601, "step": 394 }, { "completion_length": 157.78125, "epoch": 0.21134296415195292, "grad_norm": 2.2186217308044434, "kl": 0.06219214200973511, "learning_rate": 3.520499108734403e-06, "loss": 0.0025, "reward": 0.2757500410079956, "reward_std": 0.1753721833229065, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16637499630451202, "step": 395 }, { "completion_length": 137.4375, "epoch": 0.21187800963081863, "grad_norm": 1.0273642539978027, "kl": 0.10750111192464828, "learning_rate": 3.529411764705883e-06, "loss": 0.0043, "reward": 0.7856874465942383, "reward_std": 0.7239871621131897, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19193750619888306, "step": 396 }, { "completion_length": 125.28125, "epoch": 0.21241305510968433, "grad_norm": 2.0299174785614014, "kl": 0.08041425049304962, "learning_rate": 3.5383244206773622e-06, "loss": 0.0032, "reward": 0.6512500047683716, "reward_std": 0.33647578954696655, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22937500476837158, "step": 397 }, { "completion_length": 142.09375, "epoch": 0.21294810058855002, "grad_norm": 1.4907788038253784, "kl": 0.06902605295181274, "learning_rate": 3.5472370766488416e-06, "loss": 0.0028, "reward": 1.108718752861023, "reward_std": 0.8318840265274048, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18684375286102295, "step": 398 }, { "completion_length": 168.6875, "epoch": 0.21348314606741572, "grad_norm": 1.1850415468215942, "kl": 0.04624190181493759, "learning_rate": 3.556149732620321e-06, "loss": 0.0018, "reward": 0.3310624957084656, "reward_std": 0.3213735818862915, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12793749570846558, "step": 399 }, { "completion_length": 164.5625, "epoch": 0.21401819154628143, "grad_norm": 1.3587532043457031, "kl": 0.07140326499938965, "learning_rate": 3.5650623885918006e-06, "loss": 0.0029, "reward": 0.8816875219345093, "reward_std": 0.8229385614395142, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.1316874921321869, "step": 400 }, { "completion_length": 150.71875, "epoch": 0.21455323702514714, "grad_norm": 1.1401692628860474, "kl": 0.06650268286466599, "learning_rate": 3.5739750445632804e-06, "loss": 0.0027, "reward": 0.42018750309944153, "reward_std": 0.4202556610107422, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18581248819828033, "step": 401 }, { "completion_length": 131.8125, "epoch": 0.21508828250401285, "grad_norm": 231068024832.0, "kl": 670268672.0, "learning_rate": 3.5828877005347597e-06, "loss": 26810748.0, "reward": 1.0632500648498535, "reward_std": 1.008892297744751, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.21950000524520874, "step": 402 }, { "completion_length": 173.65625, "epoch": 0.21562332798287853, "grad_norm": 0.724962055683136, "kl": 0.06039568781852722, "learning_rate": 3.591800356506239e-06, "loss": 0.0024, "reward": 0.6573125123977661, "reward_std": 0.742343008518219, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15731249749660492, "step": 403 }, { "completion_length": 151.125, "epoch": 0.21615837346174424, "grad_norm": 9.222336769104004, "kl": 0.0490996390581131, "learning_rate": 3.6007130124777184e-06, "loss": 0.002, "reward": 0.8605937957763672, "reward_std": 0.8701027035713196, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1730937510728836, "step": 404 }, { "completion_length": 117.625, "epoch": 0.21669341894060995, "grad_norm": 132191264.0, "kl": 2216007.25, "learning_rate": 3.609625668449198e-06, "loss": 88640.2969, "reward": 1.2820625305175781, "reward_std": 0.7376642227172852, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23518750071525574, "step": 405 }, { "completion_length": 135.1875, "epoch": 0.21722846441947566, "grad_norm": 8.905828475952148, "kl": 0.20305079221725464, "learning_rate": 3.618538324420678e-06, "loss": 0.0081, "reward": 0.5499062538146973, "reward_std": 0.5272085666656494, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19053123891353607, "step": 406 }, { "completion_length": 162.65625, "epoch": 0.21776350989834137, "grad_norm": 4.404787540435791, "kl": 0.06777146458625793, "learning_rate": 3.6274509803921573e-06, "loss": 0.0027, "reward": 0.9080937504768372, "reward_std": 0.6069066524505615, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.15809375047683716, "step": 407 }, { "completion_length": 145.84375, "epoch": 0.21829855537720708, "grad_norm": 1.7046852111816406, "kl": 0.09583885967731476, "learning_rate": 3.6363636363636366e-06, "loss": 0.0038, "reward": 1.1082500219345093, "reward_std": 0.9744662642478943, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21762500703334808, "step": 408 }, { "completion_length": 164.53125, "epoch": 0.21883360085607276, "grad_norm": 0.7750149965286255, "kl": 0.06663526594638824, "learning_rate": 3.645276292335116e-06, "loss": 0.0027, "reward": 0.33668750524520874, "reward_std": 0.26995059847831726, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14918750524520874, "step": 409 }, { "completion_length": 171.09375, "epoch": 0.21936864633493847, "grad_norm": 0.7309465408325195, "kl": 0.06418268382549286, "learning_rate": 3.6541889483065957e-06, "loss": 0.0026, "reward": 0.5267812609672546, "reward_std": 0.5765761137008667, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.15178124606609344, "step": 410 }, { "completion_length": 156.4375, "epoch": 0.21990369181380418, "grad_norm": 18872536.0, "kl": 2048162.5, "learning_rate": 3.6631016042780754e-06, "loss": 81926.5, "reward": 0.8568124771118164, "reward_std": 0.7441855669021606, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1536875069141388, "step": 411 }, { "completion_length": 150.28125, "epoch": 0.22043873729266988, "grad_norm": 35.76652908325195, "kl": 0.1222686842083931, "learning_rate": 3.6720142602495543e-06, "loss": 0.0049, "reward": 0.6600937843322754, "reward_std": 0.5840851068496704, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1757187396287918, "step": 412 }, { "completion_length": 182.15625, "epoch": 0.2209737827715356, "grad_norm": 7.335886001586914, "kl": 0.06980093568563461, "learning_rate": 3.680926916221034e-06, "loss": 0.0028, "reward": 0.6388437747955322, "reward_std": 0.7511411905288696, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13884374499320984, "step": 413 }, { "completion_length": 129.84375, "epoch": 0.22150882825040127, "grad_norm": 1.6697489023208618, "kl": 0.09830697625875473, "learning_rate": 3.6898395721925134e-06, "loss": 0.0039, "reward": 1.3255624771118164, "reward_std": 0.9332460165023804, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2318125069141388, "step": 414 }, { "completion_length": 170.125, "epoch": 0.22204387372926698, "grad_norm": 1.2443536520004272, "kl": 0.08333319425582886, "learning_rate": 3.698752228163993e-06, "loss": 0.0033, "reward": 0.7594375014305115, "reward_std": 0.5759878158569336, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16568750143051147, "step": 415 }, { "completion_length": 152.84375, "epoch": 0.2225789192081327, "grad_norm": 1.3193209171295166, "kl": 0.13641339540481567, "learning_rate": 3.707664884135473e-06, "loss": 0.0055, "reward": 0.8958437442779541, "reward_std": 0.6995431184768677, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458437442779541, "step": 416 }, { "completion_length": 144.0625, "epoch": 0.2231139646869984, "grad_norm": 8.693076133728027, "kl": 0.14373931288719177, "learning_rate": 3.716577540106952e-06, "loss": 0.0057, "reward": 1.2131874561309814, "reward_std": 0.8317480087280273, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18193750083446503, "step": 417 }, { "completion_length": 145.5625, "epoch": 0.2236490101658641, "grad_norm": 1.7000956535339355, "kl": 0.11696180701255798, "learning_rate": 3.7254901960784316e-06, "loss": 0.0047, "reward": 0.7127187252044678, "reward_std": 0.5861781239509583, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18146874010562897, "step": 418 }, { "completion_length": 154.9375, "epoch": 0.2241840556447298, "grad_norm": 8.60641860961914, "kl": 0.1837363839149475, "learning_rate": 3.734402852049911e-06, "loss": 0.0073, "reward": 0.820968747138977, "reward_std": 0.7411601543426514, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19596873223781586, "step": 419 }, { "completion_length": 169.0, "epoch": 0.2247191011235955, "grad_norm": 1.0074621438980103, "kl": 0.08353328704833984, "learning_rate": 3.7433155080213907e-06, "loss": 0.0033, "reward": 0.25612500309944153, "reward_std": 0.17604310810565948, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14675000309944153, "step": 420 }, { "completion_length": 163.09375, "epoch": 0.2252541466024612, "grad_norm": 130.96495056152344, "kl": 0.23350626230239868, "learning_rate": 3.7522281639928705e-06, "loss": 0.0093, "reward": 0.733093798160553, "reward_std": 0.4199409782886505, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1705937534570694, "step": 421 }, { "completion_length": 144.71875, "epoch": 0.22578919208132692, "grad_norm": 7.070648193359375, "kl": 0.22713924944400787, "learning_rate": 3.7611408199643494e-06, "loss": 0.0091, "reward": 0.46299999952316284, "reward_std": 0.29345083236694336, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19737498462200165, "step": 422 }, { "completion_length": 132.21875, "epoch": 0.22632423756019263, "grad_norm": 3.94770884513855, "kl": 0.20836225152015686, "learning_rate": 3.770053475935829e-06, "loss": 0.0083, "reward": 1.4321874380111694, "reward_std": 0.4229949414730072, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2134374976158142, "step": 423 }, { "completion_length": 151.15625, "epoch": 0.2268592830390583, "grad_norm": 1598270336.0, "kl": 147349472.0, "learning_rate": 3.7789661319073085e-06, "loss": 5893979.0, "reward": 0.7567812204360962, "reward_std": 0.48753622174263, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14740625023841858, "step": 424 }, { "completion_length": 119.40625, "epoch": 0.22739432851792402, "grad_norm": 1.618298053741455, "kl": 0.08995432406663895, "learning_rate": 3.7878787878787882e-06, "loss": 0.0036, "reward": 0.9883750081062317, "reward_std": 0.1971924901008606, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2852500081062317, "step": 425 }, { "completion_length": 128.8125, "epoch": 0.22792937399678972, "grad_norm": 1.2307651042938232, "kl": 0.07911631464958191, "learning_rate": 3.796791443850268e-06, "loss": 0.0032, "reward": 0.9888124465942383, "reward_std": 0.7288287878036499, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22318750619888306, "step": 426 }, { "completion_length": 155.4375, "epoch": 0.22846441947565543, "grad_norm": 4.995761871337891, "kl": 0.12133737653493881, "learning_rate": 3.805704099821747e-06, "loss": 0.0049, "reward": 0.44884371757507324, "reward_std": 0.3552466034889221, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.078125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18321873247623444, "step": 427 }, { "completion_length": 140.8125, "epoch": 0.22899946495452114, "grad_norm": 2.05088472366333, "kl": 0.06339698284864426, "learning_rate": 3.8146167557932266e-06, "loss": 0.0025, "reward": 0.5165937542915344, "reward_std": 0.4758226275444031, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17284375429153442, "step": 428 }, { "completion_length": 157.46875, "epoch": 0.22953451043338685, "grad_norm": 1.6343135833740234, "kl": 0.14831136167049408, "learning_rate": 3.8235294117647055e-06, "loss": 0.0059, "reward": 0.8221875429153442, "reward_std": 0.8032755851745605, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19718751311302185, "step": 429 }, { "completion_length": 125.3125, "epoch": 0.23006955591225253, "grad_norm": 3.155555486679077, "kl": 0.13008983433246613, "learning_rate": 3.832442067736185e-06, "loss": 0.0052, "reward": 1.2516875267028809, "reward_std": 1.1171207427978516, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20481249690055847, "step": 430 }, { "completion_length": 144.5, "epoch": 0.23060460139111824, "grad_norm": 3.118886947631836, "kl": 0.17973592877388, "learning_rate": 3.841354723707665e-06, "loss": 0.0072, "reward": 0.31056249141693115, "reward_std": 0.328213095664978, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18556249141693115, "step": 431 }, { "completion_length": 138.65625, "epoch": 0.23113964686998395, "grad_norm": 1.9289883375167847, "kl": 0.11276096850633621, "learning_rate": 3.850267379679145e-06, "loss": 0.0045, "reward": 0.8558437824249268, "reward_std": 0.4497666358947754, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19959375262260437, "step": 432 }, { "completion_length": 179.65625, "epoch": 0.23167469234884966, "grad_norm": 161038.25, "kl": 566.84765625, "learning_rate": 3.8591800356506246e-06, "loss": 22.6739, "reward": 0.4424999952316284, "reward_std": 0.5752555727958679, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14562499523162842, "step": 433 }, { "completion_length": 140.25, "epoch": 0.23220973782771537, "grad_norm": 2.8554468154907227, "kl": 0.12336809188127518, "learning_rate": 3.8680926916221035e-06, "loss": 0.0049, "reward": 0.9511561989784241, "reward_std": 0.40460366010665894, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20115625858306885, "step": 434 }, { "completion_length": 120.0625, "epoch": 0.23274478330658105, "grad_norm": 1.9395747184753418, "kl": 0.1320873200893402, "learning_rate": 3.877005347593583e-06, "loss": 0.0053, "reward": 1.4481875896453857, "reward_std": 0.9140652418136597, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24506250023841858, "step": 435 }, { "completion_length": 138.28125, "epoch": 0.23327982878544676, "grad_norm": 2.1811065673828125, "kl": 0.1228511780500412, "learning_rate": 3.885918003565063e-06, "loss": 0.0049, "reward": 1.4114062786102295, "reward_std": 1.206470251083374, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2239062488079071, "step": 436 }, { "completion_length": 134.53125, "epoch": 0.23381487426431247, "grad_norm": 982552128.0, "kl": 1443061.875, "learning_rate": 3.894830659536542e-06, "loss": 57722.4727, "reward": 0.9601874947547913, "reward_std": 0.5963510274887085, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19456249475479126, "step": 437 }, { "completion_length": 137.40625, "epoch": 0.23434991974317818, "grad_norm": 17.363534927368164, "kl": 0.24647867679595947, "learning_rate": 3.903743315508022e-06, "loss": 0.0099, "reward": 0.8397812843322754, "reward_std": 0.44314175844192505, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.214781254529953, "step": 438 }, { "completion_length": 129.375, "epoch": 0.23488496522204388, "grad_norm": 5.0381903648376465, "kl": 0.11715066432952881, "learning_rate": 3.9126559714795006e-06, "loss": 0.0047, "reward": 1.165374994277954, "reward_std": 0.9414811134338379, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2278750091791153, "step": 439 }, { "completion_length": 147.78125, "epoch": 0.23542001070090957, "grad_norm": 2.2389307022094727, "kl": 0.09746487438678741, "learning_rate": 3.92156862745098e-06, "loss": 0.0039, "reward": 0.790093719959259, "reward_std": 0.4212659001350403, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18071874976158142, "step": 440 }, { "completion_length": 142.3125, "epoch": 0.23595505617977527, "grad_norm": 0.6690593361854553, "kl": 0.11504049599170685, "learning_rate": 3.93048128342246e-06, "loss": 0.0046, "reward": 0.9824374914169312, "reward_std": 0.8469048738479614, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20118749141693115, "step": 441 }, { "completion_length": 153.03125, "epoch": 0.23649010165864098, "grad_norm": 5.483129978179932, "kl": 0.18441715836524963, "learning_rate": 3.93939393939394e-06, "loss": 0.0074, "reward": 0.6807500123977661, "reward_std": 0.6634962558746338, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14949999749660492, "step": 442 }, { "completion_length": 128.3125, "epoch": 0.2370251471375067, "grad_norm": 9.106139183044434, "kl": 0.2317838966846466, "learning_rate": 3.94830659536542e-06, "loss": 0.0093, "reward": 0.9943749904632568, "reward_std": 0.5203291177749634, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22875000536441803, "step": 443 }, { "completion_length": 126.0625, "epoch": 0.2375601926163724, "grad_norm": 1.6779062747955322, "kl": 0.12769877910614014, "learning_rate": 3.957219251336899e-06, "loss": 0.0051, "reward": 1.4320000410079956, "reward_std": 0.7979803085327148, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21324999630451202, "step": 444 }, { "completion_length": 146.96875, "epoch": 0.23809523809523808, "grad_norm": 1.5396405458450317, "kl": 0.11285427212715149, "learning_rate": 3.966131907308378e-06, "loss": 0.0045, "reward": 0.5566875338554382, "reward_std": 0.7217659950256348, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22856250405311584, "step": 445 }, { "completion_length": 155.65625, "epoch": 0.2386302835741038, "grad_norm": 1.4547637701034546, "kl": 0.07737045735120773, "learning_rate": 3.975044563279858e-06, "loss": 0.0031, "reward": 0.9383749961853027, "reward_std": 0.8399878740310669, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15712501108646393, "step": 446 }, { "completion_length": 159.0, "epoch": 0.2391653290529695, "grad_norm": 2.5407557487487793, "kl": 0.05951354280114174, "learning_rate": 3.983957219251337e-06, "loss": 0.0024, "reward": 0.5178749561309814, "reward_std": 0.5784991383552551, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18974998593330383, "step": 447 }, { "completion_length": 140.40625, "epoch": 0.2397003745318352, "grad_norm": 13960.57421875, "kl": 33.228553771972656, "learning_rate": 3.992869875222817e-06, "loss": 1.3291, "reward": 1.3062187433242798, "reward_std": 0.9482384920120239, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.21246874332427979, "step": 448 }, { "completion_length": 146.4375, "epoch": 0.24023542001070092, "grad_norm": 2.458956241607666, "kl": 0.17197276651859283, "learning_rate": 4.0017825311942964e-06, "loss": 0.0069, "reward": 0.8197500109672546, "reward_std": 0.6244428157806396, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19475001096725464, "step": 449 }, { "completion_length": 142.9375, "epoch": 0.24077046548956663, "grad_norm": 11.737149238586426, "kl": 0.36909082531929016, "learning_rate": 4.010695187165775e-06, "loss": 0.0148, "reward": 0.6955000162124634, "reward_std": 0.7305401563644409, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19550000131130219, "step": 450 }, { "completion_length": 152.53125, "epoch": 0.2413055109684323, "grad_norm": 10.936800003051758, "kl": 0.3330758810043335, "learning_rate": 4.019607843137255e-06, "loss": 0.0133, "reward": 0.8229374885559082, "reward_std": 0.6137935519218445, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1823125034570694, "step": 451 }, { "completion_length": 135.15625, "epoch": 0.24184055644729802, "grad_norm": 465023136.0, "kl": 11173654.0, "learning_rate": 4.028520499108735e-06, "loss": 446946.1562, "reward": 0.6354687213897705, "reward_std": 0.6207796335220337, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2292187511920929, "step": 452 }, { "completion_length": 139.25, "epoch": 0.24237560192616373, "grad_norm": 0.6546344757080078, "kl": 0.07999764382839203, "learning_rate": 4.037433155080215e-06, "loss": 0.0032, "reward": 0.7404062747955322, "reward_std": 0.7575255632400513, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20915624499320984, "step": 453 }, { "completion_length": 142.96875, "epoch": 0.24291064740502943, "grad_norm": 1.837325930595398, "kl": 0.10369715839624405, "learning_rate": 4.046345811051694e-06, "loss": 0.0041, "reward": 0.555593729019165, "reward_std": 0.37927401065826416, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21184374392032623, "step": 454 }, { "completion_length": 163.03125, "epoch": 0.24344569288389514, "grad_norm": 0.9788931012153625, "kl": 0.080841563642025, "learning_rate": 4.055258467023173e-06, "loss": 0.0032, "reward": 0.44075000286102295, "reward_std": 0.43621763586997986, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15950000286102295, "step": 455 }, { "completion_length": 165.46875, "epoch": 0.24398073836276082, "grad_norm": 1.7496691942214966, "kl": 0.05813451111316681, "learning_rate": 4.064171122994653e-06, "loss": 0.0023, "reward": 0.7682499885559082, "reward_std": 0.5532603859901428, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1588750183582306, "step": 456 }, { "completion_length": 145.5625, "epoch": 0.24451578384162653, "grad_norm": 126.00261688232422, "kl": 0.20764541625976562, "learning_rate": 4.073083778966132e-06, "loss": 0.0083, "reward": 0.8394062519073486, "reward_std": 0.882066011428833, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18315623700618744, "step": 457 }, { "completion_length": 151.625, "epoch": 0.24505082932049224, "grad_norm": 1.6813839673995972, "kl": 0.08535502105951309, "learning_rate": 4.081996434937612e-06, "loss": 0.0034, "reward": 0.875781238079071, "reward_std": 0.542606770992279, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.20390625298023224, "step": 458 }, { "completion_length": 134.9375, "epoch": 0.24558587479935795, "grad_norm": 2.8341739177703857, "kl": 0.11339397728443146, "learning_rate": 4.0909090909090915e-06, "loss": 0.0045, "reward": 1.3663437366485596, "reward_std": 0.9395400881767273, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21009375154972076, "step": 459 }, { "completion_length": 135.65625, "epoch": 0.24612092027822366, "grad_norm": 2.5547523498535156, "kl": 0.0672583356499672, "learning_rate": 4.09982174688057e-06, "loss": 0.0027, "reward": 0.8996250033378601, "reward_std": 0.8782879710197449, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.2277500033378601, "step": 460 }, { "completion_length": 138.125, "epoch": 0.24665596575708934, "grad_norm": 0.6568250060081482, "kl": 0.06774581223726273, "learning_rate": 4.10873440285205e-06, "loss": 0.0027, "reward": 1.4528436660766602, "reward_std": 1.1155022382736206, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.21846875548362732, "step": 461 }, { "completion_length": 138.625, "epoch": 0.24719101123595505, "grad_norm": 1.633440375328064, "kl": 0.07181629538536072, "learning_rate": 4.11764705882353e-06, "loss": 0.0029, "reward": 1.2238438129425049, "reward_std": 0.6271721124649048, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2238437384366989, "step": 462 }, { "completion_length": 157.5, "epoch": 0.24772605671482076, "grad_norm": 0.6720988154411316, "kl": 0.08336509764194489, "learning_rate": 4.12655971479501e-06, "loss": 0.0033, "reward": 1.0100938081741333, "reward_std": 0.9382752180099487, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18196874856948853, "step": 463 }, { "completion_length": 166.09375, "epoch": 0.24826110219368647, "grad_norm": 4.373018741607666, "kl": 0.31937021017074585, "learning_rate": 4.135472370766489e-06, "loss": 0.0128, "reward": 0.7038750052452087, "reward_std": 0.5307610034942627, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.18825000524520874, "step": 464 }, { "completion_length": 130.0625, "epoch": 0.24879614767255218, "grad_norm": 2.3026788234710693, "kl": 0.12289442867040634, "learning_rate": 4.144385026737968e-06, "loss": 0.0049, "reward": 1.5184062719345093, "reward_std": 0.647371768951416, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22153125703334808, "step": 465 }, { "completion_length": 165.9375, "epoch": 0.24933119315141786, "grad_norm": 0.6396226286888123, "kl": 0.06164398789405823, "learning_rate": 4.153297682709448e-06, "loss": 0.0025, "reward": 0.5064374804496765, "reward_std": 0.49466535449028015, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1470624953508377, "step": 466 }, { "completion_length": 149.03125, "epoch": 0.24986623863028357, "grad_norm": 2.3049519062042236, "kl": 0.0614023432135582, "learning_rate": 4.162210338680927e-06, "loss": 0.0025, "reward": 0.9122812747955322, "reward_std": 0.614997386932373, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19353124499320984, "step": 467 }, { "completion_length": 170.34375, "epoch": 0.2504012841091493, "grad_norm": 2.0957796573638916, "kl": 0.08489249646663666, "learning_rate": 4.171122994652407e-06, "loss": 0.0034, "reward": 0.4255000054836273, "reward_std": 0.5121064186096191, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15987500548362732, "step": 468 }, { "completion_length": 131.65625, "epoch": 0.250936329588015, "grad_norm": 1.1716562509536743, "kl": 0.06173257529735565, "learning_rate": 4.1800356506238865e-06, "loss": 0.0025, "reward": 0.8534375429153442, "reward_std": 0.6739568710327148, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21281249821186066, "step": 469 }, { "completion_length": 155.40625, "epoch": 0.25147137506688066, "grad_norm": 6.024100303649902, "kl": 0.08114910870790482, "learning_rate": 4.188948306595365e-06, "loss": 0.0032, "reward": 0.8482812643051147, "reward_std": 0.7549842596054077, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17640623450279236, "step": 470 }, { "completion_length": 129.21875, "epoch": 0.2520064205457464, "grad_norm": 1.3793138265609741, "kl": 0.11503198742866516, "learning_rate": 4.197860962566845e-06, "loss": 0.0046, "reward": 0.8730937242507935, "reward_std": 0.705551266670227, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.24809375405311584, "step": 471 }, { "completion_length": 135.4375, "epoch": 0.2525414660246121, "grad_norm": 2.791792869567871, "kl": 0.10892768204212189, "learning_rate": 4.206773618538325e-06, "loss": 0.0044, "reward": 0.8082187175750732, "reward_std": 0.48940038681030273, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23009374737739563, "step": 472 }, { "completion_length": 115.21875, "epoch": 0.2530765115034778, "grad_norm": 2.0844473838806152, "kl": 0.09558313339948654, "learning_rate": 4.215686274509805e-06, "loss": 0.0038, "reward": 1.7823437452316284, "reward_std": 0.9494414329528809, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2667187452316284, "step": 473 }, { "completion_length": 120.25, "epoch": 0.2536115569823435, "grad_norm": 0.5689305067062378, "kl": 0.14149996638298035, "learning_rate": 4.224598930481284e-06, "loss": 0.0057, "reward": 1.4212501049041748, "reward_std": 0.8075989484786987, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26500001549720764, "step": 474 }, { "completion_length": 152.0625, "epoch": 0.2541466024612092, "grad_norm": 9.642892837524414, "kl": 0.10378755629062653, "learning_rate": 4.233511586452763e-06, "loss": 0.0042, "reward": 0.9934999942779541, "reward_std": 0.5705205202102661, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1809999942779541, "step": 475 }, { "completion_length": 139.53125, "epoch": 0.2546816479400749, "grad_norm": 2.72552490234375, "kl": 0.16671186685562134, "learning_rate": 4.242424242424243e-06, "loss": 0.0067, "reward": 1.5052499771118164, "reward_std": 1.2377991676330566, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.2239999920129776, "step": 476 }, { "completion_length": 136.125, "epoch": 0.2552166934189406, "grad_norm": 5054130.0, "kl": 1665.8043212890625, "learning_rate": 4.251336898395722e-06, "loss": 66.6322, "reward": 0.6209062337875366, "reward_std": 0.3576282262802124, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.24590624868869781, "step": 477 }, { "completion_length": 151.5, "epoch": 0.25575173889780634, "grad_norm": 0.6741389036178589, "kl": 0.06158649921417236, "learning_rate": 4.260249554367202e-06, "loss": 0.0025, "reward": 1.2160937786102295, "reward_std": 1.1111421585083008, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.2004687488079071, "step": 478 }, { "completion_length": 147.75, "epoch": 0.256286784376672, "grad_norm": 1.5304349660873413, "kl": 0.08597811311483383, "learning_rate": 4.2691622103386815e-06, "loss": 0.0034, "reward": 0.9803749322891235, "reward_std": 0.5794196128845215, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1834999918937683, "step": 479 }, { "completion_length": 173.09375, "epoch": 0.2568218298555377, "grad_norm": 0.8237088918685913, "kl": 0.10933926701545715, "learning_rate": 4.2780748663101604e-06, "loss": 0.0044, "reward": 0.8267187476158142, "reward_std": 0.4703156352043152, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1704687476158142, "step": 480 }, { "completion_length": 124.84375, "epoch": 0.25735687533440343, "grad_norm": 1.0047539472579956, "kl": 0.07276050746440887, "learning_rate": 4.28698752228164e-06, "loss": 0.0029, "reward": 1.5975000858306885, "reward_std": 0.9477010369300842, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2537499964237213, "step": 481 }, { "completion_length": 136.875, "epoch": 0.2578919208132691, "grad_norm": 0.6462523341178894, "kl": 0.04960593581199646, "learning_rate": 4.29590017825312e-06, "loss": 0.002, "reward": 1.4149062633514404, "reward_std": 0.5768589973449707, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.22740624845027924, "step": 482 }, { "completion_length": 119.75, "epoch": 0.25842696629213485, "grad_norm": 167836992.0, "kl": 704929.75, "learning_rate": 4.304812834224599e-06, "loss": 28197.1855, "reward": 1.0832188129425049, "reward_std": 0.7941348552703857, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.2550937533378601, "step": 483 }, { "completion_length": 135.25, "epoch": 0.25896201177100053, "grad_norm": 0.9420516490936279, "kl": 0.0846303403377533, "learning_rate": 4.313725490196079e-06, "loss": 0.0034, "reward": 1.8348125219345093, "reward_std": 0.9825849533081055, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.24106250703334808, "step": 484 }, { "completion_length": 138.84375, "epoch": 0.2594970572498662, "grad_norm": 2962.81201171875, "kl": 13.186210632324219, "learning_rate": 4.322638146167558e-06, "loss": 0.5274, "reward": 1.3088124990463257, "reward_std": 0.7913475632667542, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23068749904632568, "step": 485 }, { "completion_length": 128.03125, "epoch": 0.26003210272873195, "grad_norm": 2.308253049850464, "kl": 0.1632193922996521, "learning_rate": 4.331550802139038e-06, "loss": 0.0065, "reward": 1.7755625247955322, "reward_std": 1.128631830215454, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.24431249499320984, "step": 486 }, { "completion_length": 168.15625, "epoch": 0.26056714820759763, "grad_norm": 1.648793339729309, "kl": 0.08234929293394089, "learning_rate": 4.340463458110517e-06, "loss": 0.0033, "reward": 0.6635313034057617, "reward_std": 0.5835587978363037, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17915624380111694, "step": 487 }, { "completion_length": 119.90625, "epoch": 0.26110219368646337, "grad_norm": 13.887626647949219, "kl": 0.14279191195964813, "learning_rate": 4.349376114081997e-06, "loss": 0.0057, "reward": 1.2804374694824219, "reward_std": 0.6654492616653442, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24918749928474426, "step": 488 }, { "completion_length": 150.03125, "epoch": 0.26163723916532905, "grad_norm": 112787.4609375, "kl": 1972.997802734375, "learning_rate": 4.3582887700534766e-06, "loss": 78.9199, "reward": 0.676562488079071, "reward_std": 0.537277102470398, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20781250298023224, "step": 489 }, { "completion_length": 142.65625, "epoch": 0.26217228464419473, "grad_norm": 1.2007683515548706, "kl": 0.11763237416744232, "learning_rate": 4.3672014260249555e-06, "loss": 0.0047, "reward": 1.3863437175750732, "reward_std": 1.1455800533294678, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.23009374737739563, "step": 490 }, { "completion_length": 175.125, "epoch": 0.26270733012306047, "grad_norm": 4.050726890563965, "kl": 0.1400148570537567, "learning_rate": 4.376114081996435e-06, "loss": 0.0056, "reward": 0.8619062900543213, "reward_std": 0.8585641384124756, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1587812602519989, "step": 491 }, { "completion_length": 146.5, "epoch": 0.26324237560192615, "grad_norm": 0.764924168586731, "kl": 0.10396616160869598, "learning_rate": 4.385026737967915e-06, "loss": 0.0042, "reward": 0.5082812309265137, "reward_std": 0.30238473415374756, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21140626072883606, "step": 492 }, { "completion_length": 137.0625, "epoch": 0.2637774210807919, "grad_norm": 0.629497766494751, "kl": 0.0710451602935791, "learning_rate": 4.393939393939394e-06, "loss": 0.0028, "reward": 0.7873125076293945, "reward_std": 0.4740118980407715, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20918750762939453, "step": 493 }, { "completion_length": 142.75, "epoch": 0.26431246655965757, "grad_norm": 1.744695782661438, "kl": 0.0860029011964798, "learning_rate": 4.402852049910874e-06, "loss": 0.0034, "reward": 0.6223437786102295, "reward_std": 0.5836797952651978, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.2004687488079071, "step": 494 }, { "completion_length": 130.96875, "epoch": 0.26484751203852325, "grad_norm": 3.2314391136169434, "kl": 0.10635615140199661, "learning_rate": 4.411764705882353e-06, "loss": 0.0043, "reward": 1.155656337738037, "reward_std": 0.7547322511672974, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21815624833106995, "step": 495 }, { "completion_length": 111.90625, "epoch": 0.265382557517389, "grad_norm": 4.964876174926758, "kl": 0.18096625804901123, "learning_rate": 4.420677361853833e-06, "loss": 0.0072, "reward": 1.2277500629425049, "reward_std": 0.896965742111206, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2746250033378601, "step": 496 }, { "completion_length": 159.5, "epoch": 0.26591760299625467, "grad_norm": 0.9324315190315247, "kl": 0.09219729900360107, "learning_rate": 4.429590017825312e-06, "loss": 0.0037, "reward": 0.843999981880188, "reward_std": 0.7470042705535889, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.203374981880188, "step": 497 }, { "completion_length": 138.375, "epoch": 0.2664526484751204, "grad_norm": 1.5680216550827026, "kl": 0.09005965292453766, "learning_rate": 4.438502673796792e-06, "loss": 0.0036, "reward": 1.6749687194824219, "reward_std": 1.0930265188217163, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23746876418590546, "step": 498 }, { "completion_length": 148.78125, "epoch": 0.2669876939539861, "grad_norm": 1.0947850942611694, "kl": 0.09766508638858795, "learning_rate": 4.447415329768272e-06, "loss": 0.0039, "reward": 1.3400624990463257, "reward_std": 0.492107093334198, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19943749904632568, "step": 499 }, { "completion_length": 151.21875, "epoch": 0.2675227394328518, "grad_norm": 6.51837158203125, "kl": 0.12783803045749664, "learning_rate": 4.4563279857397505e-06, "loss": 0.0051, "reward": 1.070812463760376, "reward_std": 1.0145443677902222, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18018749356269836, "step": 500 }, { "completion_length": 145.84375, "epoch": 0.2680577849117175, "grad_norm": 7.142597675323486, "kl": 0.28197428584098816, "learning_rate": 4.46524064171123e-06, "loss": 0.0113, "reward": 0.5651875138282776, "reward_std": 0.22979900240898132, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1901874989271164, "step": 501 }, { "completion_length": 119.5625, "epoch": 0.2685928303905832, "grad_norm": 1.1407296657562256, "kl": 0.17354264855384827, "learning_rate": 4.474153297682709e-06, "loss": 0.0069, "reward": 1.2145313024520874, "reward_std": 0.8079571723937988, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24578124284744263, "step": 502 }, { "completion_length": 149.03125, "epoch": 0.2691278758694489, "grad_norm": 1.0642926692962646, "kl": 0.11349543184041977, "learning_rate": 4.483065953654189e-06, "loss": 0.0045, "reward": 0.8914999961853027, "reward_std": 0.8153427243232727, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18837501108646393, "step": 503 }, { "completion_length": 130.3125, "epoch": 0.2696629213483146, "grad_norm": 1.4571481943130493, "kl": 0.10660744458436966, "learning_rate": 4.491978609625669e-06, "loss": 0.0043, "reward": 1.752906322479248, "reward_std": 0.8428686857223511, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.23728123307228088, "step": 504 }, { "completion_length": 145.4375, "epoch": 0.27019796682718034, "grad_norm": 0.8692178726196289, "kl": 0.10191358625888824, "learning_rate": 4.5008912655971484e-06, "loss": 0.0041, "reward": 0.9299375414848328, "reward_std": 0.8227660059928894, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21118751168251038, "step": 505 }, { "completion_length": 150.53125, "epoch": 0.270733012306046, "grad_norm": 0.908882737159729, "kl": 0.09353134036064148, "learning_rate": 4.509803921568628e-06, "loss": 0.0037, "reward": 1.3336249589920044, "reward_std": 1.0762202739715576, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17737500369548798, "step": 506 }, { "completion_length": 147.34375, "epoch": 0.2712680577849117, "grad_norm": 123.06321716308594, "kl": 0.32731735706329346, "learning_rate": 4.518716577540107e-06, "loss": 0.0131, "reward": 0.690000057220459, "reward_std": 0.5992581844329834, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1900000125169754, "step": 507 }, { "completion_length": 143.78125, "epoch": 0.27180310326377743, "grad_norm": 63038.73046875, "kl": 9431.6923828125, "learning_rate": 4.527629233511587e-06, "loss": 377.2677, "reward": 0.9996249675750732, "reward_std": 0.4353262782096863, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.20274999737739563, "step": 508 }, { "completion_length": 136.65625, "epoch": 0.2723381487426431, "grad_norm": 0.8864173889160156, "kl": 0.11414393782615662, "learning_rate": 4.536541889483067e-06, "loss": 0.0046, "reward": 1.649156093597412, "reward_std": 0.884843111038208, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24290625751018524, "step": 509 }, { "completion_length": 132.53125, "epoch": 0.27287319422150885, "grad_norm": 1.0240767002105713, "kl": 0.10789159685373306, "learning_rate": 4.5454545454545455e-06, "loss": 0.0043, "reward": 0.9593125581741333, "reward_std": 0.623863697052002, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22493749856948853, "step": 510 }, { "completion_length": 154.34375, "epoch": 0.27340823970037453, "grad_norm": 0.49868839979171753, "kl": 0.08031927794218063, "learning_rate": 4.554367201426025e-06, "loss": 0.0032, "reward": 0.6019375324249268, "reward_std": 0.41712677478790283, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18006250262260437, "step": 511 }, { "completion_length": 113.25, "epoch": 0.2739432851792402, "grad_norm": 1.3850229978561401, "kl": 0.11357066035270691, "learning_rate": 4.563279857397504e-06, "loss": 0.0045, "reward": 1.5255311727523804, "reward_std": 0.5408128499984741, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25990623235702515, "step": 512 }, { "completion_length": 153.375, "epoch": 0.27447833065810595, "grad_norm": 587615552.0, "kl": 28139560.0, "learning_rate": 4.572192513368984e-06, "loss": 1125582.25, "reward": 0.9935000538825989, "reward_std": 0.7209681868553162, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1966250091791153, "step": 513 }, { "completion_length": 158.875, "epoch": 0.27501337613697163, "grad_norm": 1.1396201848983765, "kl": 0.09973813593387604, "learning_rate": 4.581105169340464e-06, "loss": 0.004, "reward": 1.0029687881469727, "reward_std": 0.7444486618041992, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15921874344348907, "step": 514 }, { "completion_length": 148.59375, "epoch": 0.27554842161583737, "grad_norm": 74831.1015625, "kl": 236.310546875, "learning_rate": 4.5900178253119435e-06, "loss": 9.4524, "reward": 0.8438437581062317, "reward_std": 0.5655967593193054, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2032187432050705, "step": 515 }, { "completion_length": 153.09375, "epoch": 0.27608346709470305, "grad_norm": 3837358080.0, "kl": 229353536.0, "learning_rate": 4.598930481283423e-06, "loss": 9174141.0, "reward": 0.6124062538146973, "reward_std": 0.6717393398284912, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19053125381469727, "step": 516 }, { "completion_length": 143.28125, "epoch": 0.27661851257356873, "grad_norm": 1155988.0, "kl": 78722.4921875, "learning_rate": 4.607843137254902e-06, "loss": 3148.8999, "reward": 1.0187187194824219, "reward_std": 0.47693273425102234, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20621874928474426, "step": 517 }, { "completion_length": 139.1875, "epoch": 0.27715355805243447, "grad_norm": 2.8398985862731934, "kl": 0.12072663009166718, "learning_rate": 4.616755793226382e-06, "loss": 0.0048, "reward": 1.4481563568115234, "reward_std": 0.9460020661354065, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2137812376022339, "step": 518 }, { "completion_length": 136.71875, "epoch": 0.27768860353130015, "grad_norm": 0.9436740875244141, "kl": 0.0734037309885025, "learning_rate": 4.625668449197862e-06, "loss": 0.0029, "reward": 0.5969687700271606, "reward_std": 0.4488331377506256, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19071874022483826, "step": 519 }, { "completion_length": 119.09375, "epoch": 0.2782236490101659, "grad_norm": 44036344.0, "kl": 236271.375, "learning_rate": 4.6345811051693405e-06, "loss": 9450.8564, "reward": 1.5672812461853027, "reward_std": 0.9157366752624512, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25478124618530273, "step": 520 }, { "completion_length": 150.71875, "epoch": 0.27875869448903157, "grad_norm": 1.1982167959213257, "kl": 0.10052873939275742, "learning_rate": 4.64349376114082e-06, "loss": 0.004, "reward": 0.7809374928474426, "reward_std": 0.6211971044540405, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18718749284744263, "step": 521 }, { "completion_length": 149.875, "epoch": 0.27929373996789725, "grad_norm": 3936.436767578125, "kl": 11.59850788116455, "learning_rate": 4.6524064171123e-06, "loss": 0.4639, "reward": 0.6907812356948853, "reward_std": 0.5388943552970886, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20640623569488525, "step": 522 }, { "completion_length": 121.9375, "epoch": 0.279828785446763, "grad_norm": 0.9779459834098816, "kl": 0.11734985560178757, "learning_rate": 4.661319073083779e-06, "loss": 0.0047, "reward": 1.1923749446868896, "reward_std": 0.7561180591583252, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2548750042915344, "step": 523 }, { "completion_length": 146.0625, "epoch": 0.28036383092562867, "grad_norm": 30003250.0, "kl": 8263866.5, "learning_rate": 4.670231729055259e-06, "loss": 330554.6562, "reward": 0.960437536239624, "reward_std": 0.6913436651229858, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22606249153614044, "step": 524 }, { "completion_length": 124.59375, "epoch": 0.2808988764044944, "grad_norm": 159643264.0, "kl": 33707772.0, "learning_rate": 4.6791443850267385e-06, "loss": 1348310.875, "reward": 1.3791249990463257, "reward_std": 0.9904014468193054, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2541249990463257, "step": 525 }, { "completion_length": 146.9375, "epoch": 0.2814339218833601, "grad_norm": 39942328.0, "kl": 189398.375, "learning_rate": 4.688057040998218e-06, "loss": 7575.9355, "reward": 0.6955937147140503, "reward_std": 0.4018939137458801, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.22684374451637268, "step": 526 }, { "completion_length": 146.125, "epoch": 0.28196896736222576, "grad_norm": 2.397361993789673, "kl": 0.14354875683784485, "learning_rate": 4.696969696969698e-06, "loss": 0.0057, "reward": 0.5948437452316284, "reward_std": 0.44630271196365356, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20421874523162842, "step": 527 }, { "completion_length": 130.21875, "epoch": 0.2825040128410915, "grad_norm": 2.1718695163726807, "kl": 0.1498590111732483, "learning_rate": 4.705882352941177e-06, "loss": 0.006, "reward": 1.4239686727523804, "reward_std": 0.7673525810241699, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23646874725818634, "step": 528 }, { "completion_length": 149.96875, "epoch": 0.2830390583199572, "grad_norm": 522102.4375, "kl": 3482.64208984375, "learning_rate": 4.714795008912657e-06, "loss": 139.3057, "reward": 0.6405313014984131, "reward_std": 0.5928102731704712, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2342812418937683, "step": 529 }, { "completion_length": 147.28125, "epoch": 0.2835741037988229, "grad_norm": 1.4382996559143066, "kl": 0.16951152682304382, "learning_rate": 4.7237076648841356e-06, "loss": 0.0068, "reward": 1.54812490940094, "reward_std": 0.8845933675765991, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2043749988079071, "step": 530 }, { "completion_length": 120.78125, "epoch": 0.2841091492776886, "grad_norm": 130966856.0, "kl": 828633.1875, "learning_rate": 4.732620320855615e-06, "loss": 33145.3281, "reward": 1.538562536239624, "reward_std": 0.8590182065963745, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24168750643730164, "step": 531 }, { "completion_length": 146.34375, "epoch": 0.2846441947565543, "grad_norm": 1261360128.0, "kl": 49550176.0, "learning_rate": 4.741532976827095e-06, "loss": 1982007.125, "reward": 1.0437812805175781, "reward_std": 0.8839014768600464, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23128125071525574, "step": 532 }, { "completion_length": 161.0, "epoch": 0.28517924023542, "grad_norm": 1.573358178138733, "kl": 0.12315855175256729, "learning_rate": 4.750445632798574e-06, "loss": 0.0049, "reward": 1.2208750247955322, "reward_std": 0.526652991771698, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22087499499320984, "step": 533 }, { "completion_length": 138.6875, "epoch": 0.2857142857142857, "grad_norm": 1.7480285167694092, "kl": 0.11410924792289734, "learning_rate": 4.759358288770054e-06, "loss": 0.0046, "reward": 1.3358125686645508, "reward_std": 0.8646221160888672, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.242062509059906, "step": 534 }, { "completion_length": 134.84375, "epoch": 0.28624933119315144, "grad_norm": 0.9093268513679504, "kl": 0.1199382096529007, "learning_rate": 4.7682709447415335e-06, "loss": 0.0048, "reward": 1.3056249618530273, "reward_std": 0.4171932637691498, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21187499165534973, "step": 535 }, { "completion_length": 134.0, "epoch": 0.2867843766720171, "grad_norm": 0.8454050421714783, "kl": 0.09768184274435043, "learning_rate": 4.777183600713013e-06, "loss": 0.0039, "reward": 1.4234375953674316, "reward_std": 0.5097995400428772, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.23593750596046448, "step": 536 }, { "completion_length": 152.71875, "epoch": 0.28731942215088285, "grad_norm": 2.3247342109680176, "kl": 0.18546807765960693, "learning_rate": 4.786096256684493e-06, "loss": 0.0074, "reward": 0.9327499866485596, "reward_std": 0.5947510004043579, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.21400001645088196, "step": 537 }, { "completion_length": 158.5625, "epoch": 0.28785446762974853, "grad_norm": 721.96875, "kl": 4.298344612121582, "learning_rate": 4.795008912655972e-06, "loss": 0.1719, "reward": 0.6653437614440918, "reward_std": 0.4345054626464844, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.1965937465429306, "step": 538 }, { "completion_length": 120.34375, "epoch": 0.2883895131086142, "grad_norm": 300.0639343261719, "kl": 24.301311492919922, "learning_rate": 4.803921568627452e-06, "loss": 0.9721, "reward": 1.301031231880188, "reward_std": 0.9462289810180664, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20728124678134918, "step": 539 }, { "completion_length": 150.6875, "epoch": 0.28892455858747995, "grad_norm": 1.959051489830017, "kl": 0.09626677632331848, "learning_rate": 4.812834224598931e-06, "loss": 0.0039, "reward": 0.7400624752044678, "reward_std": 0.6224116086959839, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.20881250500679016, "step": 540 }, { "completion_length": 162.1875, "epoch": 0.28945960406634563, "grad_norm": 118640.9140625, "kl": 14568.23046875, "learning_rate": 4.82174688057041e-06, "loss": 582.7292, "reward": 0.472968727350235, "reward_std": 0.358320951461792, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19171875715255737, "step": 541 }, { "completion_length": 134.40625, "epoch": 0.28999464954521137, "grad_norm": 0.7405645847320557, "kl": 0.11762446165084839, "learning_rate": 4.83065953654189e-06, "loss": 0.0047, "reward": 1.19712495803833, "reward_std": 0.95395827293396, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24400000274181366, "step": 542 }, { "completion_length": 159.4375, "epoch": 0.29052969502407705, "grad_norm": 1.7090049982070923, "kl": 0.1374407410621643, "learning_rate": 4.839572192513369e-06, "loss": 0.0055, "reward": 0.9870624542236328, "reward_std": 0.7159587144851685, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.2058124989271164, "step": 543 }, { "completion_length": 130.1875, "epoch": 0.29106474050294273, "grad_norm": 1.0052461624145508, "kl": 0.14143522083759308, "learning_rate": 4.848484848484849e-06, "loss": 0.0057, "reward": 0.8988437652587891, "reward_std": 0.8252813816070557, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24259376525878906, "step": 544 }, { "completion_length": 151.96875, "epoch": 0.29159978598180847, "grad_norm": 1.3940376043319702, "kl": 0.10795305669307709, "learning_rate": 4.8573975044563285e-06, "loss": 0.0043, "reward": 0.9083437919616699, "reward_std": 0.8266648054122925, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18959374725818634, "step": 545 }, { "completion_length": 160.09375, "epoch": 0.29213483146067415, "grad_norm": 0.7986281514167786, "kl": 0.13126054406166077, "learning_rate": 4.866310160427808e-06, "loss": 0.0053, "reward": 0.8323125243186951, "reward_std": 0.8089392781257629, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19168749451637268, "step": 546 }, { "completion_length": 158.5625, "epoch": 0.2926698769395399, "grad_norm": 1.041006326675415, "kl": 0.08445398509502411, "learning_rate": 4.875222816399288e-06, "loss": 0.0034, "reward": 0.7235937714576721, "reward_std": 0.6211674809455872, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.19234375655651093, "step": 547 }, { "completion_length": 146.09375, "epoch": 0.29320492241840557, "grad_norm": 278.48681640625, "kl": 1.1632955074310303, "learning_rate": 4.884135472370767e-06, "loss": 0.0465, "reward": 1.2232812643051147, "reward_std": 1.0535871982574463, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.25453126430511475, "step": 548 }, { "completion_length": 128.9375, "epoch": 0.29373996789727125, "grad_norm": 1.8061426877975464, "kl": 0.12733566761016846, "learning_rate": 4.893048128342247e-06, "loss": 0.0051, "reward": 1.3171563148498535, "reward_std": 1.0938620567321777, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.25465625524520874, "step": 549 }, { "completion_length": 129.28125, "epoch": 0.294275013376137, "grad_norm": 1.020645022392273, "kl": 0.15258407592773438, "learning_rate": 4.901960784313726e-06, "loss": 0.0061, "reward": 1.257406234741211, "reward_std": 0.5679216384887695, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24178123474121094, "step": 550 }, { "completion_length": 126.28125, "epoch": 0.29481005885500267, "grad_norm": 62.806148529052734, "kl": 0.7245900630950928, "learning_rate": 4.910873440285205e-06, "loss": 0.029, "reward": 1.3998124599456787, "reward_std": 0.8643145561218262, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2748124897480011, "step": 551 }, { "completion_length": 138.84375, "epoch": 0.2953451043338684, "grad_norm": 1.4102038145065308, "kl": 0.14593416452407837, "learning_rate": 4.919786096256685e-06, "loss": 0.0058, "reward": 1.1729062795639038, "reward_std": 0.9741991758346558, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2822812497615814, "step": 552 }, { "completion_length": 146.59375, "epoch": 0.2958801498127341, "grad_norm": 1.0834661722183228, "kl": 0.12066717445850372, "learning_rate": 4.928698752228164e-06, "loss": 0.0048, "reward": 0.6286250352859497, "reward_std": 0.502582848072052, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.20675000548362732, "step": 553 }, { "completion_length": 115.625, "epoch": 0.29641519529159976, "grad_norm": 1.5594919919967651, "kl": 0.2213023155927658, "learning_rate": 4.937611408199644e-06, "loss": 0.0089, "reward": 1.2661874294281006, "reward_std": 0.8313131928443909, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.29743748903274536, "step": 554 }, { "completion_length": 135.75, "epoch": 0.2969502407704655, "grad_norm": 0.9426469802856445, "kl": 0.12891408801078796, "learning_rate": 4.9465240641711236e-06, "loss": 0.0052, "reward": 1.247937560081482, "reward_std": 0.9441639184951782, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.27918750047683716, "step": 555 }, { "completion_length": 140.09375, "epoch": 0.2974852862493312, "grad_norm": 3.7242825031280518, "kl": 0.12330898642539978, "learning_rate": 4.955436720142603e-06, "loss": 0.0049, "reward": 1.8617501258850098, "reward_std": 0.625201404094696, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.2523750066757202, "step": 556 }, { "completion_length": 137.71875, "epoch": 0.2980203317281969, "grad_norm": 0.7415712475776672, "kl": 0.11861289292573929, "learning_rate": 4.964349376114082e-06, "loss": 0.0047, "reward": 1.3356249332427979, "reward_std": 0.9262749552726746, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.24187499284744263, "step": 557 }, { "completion_length": 151.875, "epoch": 0.2985553772070626, "grad_norm": 9.313074111938477, "kl": 0.13014036417007446, "learning_rate": 4.973262032085562e-06, "loss": 0.0052, "reward": 1.0577812194824219, "reward_std": 0.6158734560012817, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.24528126418590546, "step": 558 }, { "completion_length": 154.59375, "epoch": 0.2990904226859283, "grad_norm": 202575104.0, "kl": 319255.375, "learning_rate": 4.982174688057042e-06, "loss": 12770.2188, "reward": 0.9648749828338623, "reward_std": 0.623198926448822, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.2304999828338623, "step": 559 }, { "completion_length": 119.8125, "epoch": 0.299625468164794, "grad_norm": 0.8201929926872253, "kl": 0.15358443558216095, "learning_rate": 4.991087344028521e-06, "loss": 0.0061, "reward": 1.3776874542236328, "reward_std": 0.8942487239837646, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2995624840259552, "step": 560 }, { "completion_length": 139.78125, "epoch": 0.3001605136436597, "grad_norm": 4651622.5, "kl": 201783.265625, "learning_rate": 5e-06, "loss": 8071.3311, "reward": 0.8089061975479126, "reward_std": 0.5390723943710327, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.2620312571525574, "step": 561 }, { "completion_length": 144.1875, "epoch": 0.30069555912252544, "grad_norm": 0.7900323271751404, "kl": 0.11412939429283142, "learning_rate": 4.999999515476047e-06, "loss": 0.0046, "reward": 1.1375937461853027, "reward_std": 0.645972490310669, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23134374618530273, "step": 562 }, { "completion_length": 131.625, "epoch": 0.3012306046013911, "grad_norm": 1.8038222789764404, "kl": 0.14183346927165985, "learning_rate": 4.9999980619043754e-06, "loss": 0.0057, "reward": 1.4622187614440918, "reward_std": 0.9409651160240173, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.3059687614440918, "step": 563 }, { "completion_length": 159.75, "epoch": 0.3017656500802568, "grad_norm": 2.478877067565918, "kl": 0.15499812364578247, "learning_rate": 4.999995639285548e-06, "loss": 0.0062, "reward": 0.6973750591278076, "reward_std": 0.6075812578201294, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.22862499952316284, "step": 564 }, { "completion_length": 111.6875, "epoch": 0.30230069555912253, "grad_norm": 1.9595016241073608, "kl": 0.17489895224571228, "learning_rate": 4.999992247620504e-06, "loss": 0.007, "reward": 1.4695625305175781, "reward_std": 0.5608975291252136, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28206250071525574, "step": 565 }, { "completion_length": 141.25, "epoch": 0.3028357410379882, "grad_norm": 2.4282174110412598, "kl": 0.14755606651306152, "learning_rate": 4.999987886910559e-06, "loss": 0.0059, "reward": 1.1509062051773071, "reward_std": 0.7003718018531799, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.2602812647819519, "step": 566 }, { "completion_length": 115.25, "epoch": 0.30337078651685395, "grad_norm": 1.4557820558547974, "kl": 0.18227989971637726, "learning_rate": 4.999982557157403e-06, "loss": 0.0073, "reward": 1.1104687452316284, "reward_std": 0.4004429578781128, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.2979687750339508, "step": 567 }, { "completion_length": 133.40625, "epoch": 0.30390583199571963, "grad_norm": 7.356126308441162, "kl": 0.2144286036491394, "learning_rate": 4.9999762583631015e-06, "loss": 0.0086, "reward": 1.3688750267028809, "reward_std": 1.1600940227508545, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25949999690055847, "step": 568 }, { "completion_length": 124.53125, "epoch": 0.3044408774745853, "grad_norm": 5.8234052658081055, "kl": 0.36251595616340637, "learning_rate": 4.999968990530096e-06, "loss": 0.0145, "reward": 1.3153438568115234, "reward_std": 0.638300359249115, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.3153437376022339, "step": 569 }, { "completion_length": 124.40625, "epoch": 0.30497592295345105, "grad_norm": 2.0799901485443115, "kl": 0.17228195071220398, "learning_rate": 4.999960753661204e-06, "loss": 0.0069, "reward": 1.7418437004089355, "reward_std": 0.682796835899353, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.2887187600135803, "step": 570 }, { "completion_length": 148.59375, "epoch": 0.30551096843231673, "grad_norm": 0.817365825176239, "kl": 0.24158884584903717, "learning_rate": 4.9999515477596175e-06, "loss": 0.0097, "reward": 0.8980624675750732, "reward_std": 0.9891007542610168, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.25743749737739563, "step": 571 }, { "completion_length": 139.1875, "epoch": 0.30604601391118247, "grad_norm": 120.33000183105469, "kl": 0.4513019323348999, "learning_rate": 4.9999413728289055e-06, "loss": 0.0181, "reward": 1.2131874561309814, "reward_std": 0.9499621987342834, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.2913125157356262, "step": 572 }, { "completion_length": 117.125, "epoch": 0.30658105939004815, "grad_norm": 16.670534133911133, "kl": 0.21514803171157837, "learning_rate": 4.9999302288730124e-06, "loss": 0.0086, "reward": 1.0357187986373901, "reward_std": 0.7572939395904541, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.33259376883506775, "step": 573 }, { "completion_length": 139.90625, "epoch": 0.30711610486891383, "grad_norm": 37220.01953125, "kl": 272.1947326660156, "learning_rate": 4.999918115896257e-06, "loss": 10.8878, "reward": 1.399343729019165, "reward_std": 0.7995388507843018, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.3055937588214874, "step": 574 }, { "completion_length": 139.5, "epoch": 0.30765115034777957, "grad_norm": 0.7997912764549255, "kl": 0.17238499224185944, "learning_rate": 4.999905033903336e-06, "loss": 0.0069, "reward": 1.4453749656677246, "reward_std": 0.9732170104980469, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.273499995470047, "step": 575 }, { "completion_length": 117.65625, "epoch": 0.30818619582664525, "grad_norm": 3.768213987350464, "kl": 0.15683715045452118, "learning_rate": 4.9998909828993185e-06, "loss": 0.0063, "reward": 1.0402812957763672, "reward_std": 0.9295781254768372, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.2902812361717224, "step": 576 }, { "completion_length": 161.03125, "epoch": 0.308721241305511, "grad_norm": 0.7117462754249573, "kl": 0.13009029626846313, "learning_rate": 4.9998759628896514e-06, "loss": 0.0052, "reward": 1.1629687547683716, "reward_std": 0.9532075524330139, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.2410937398672104, "step": 577 }, { "completion_length": 129.75, "epoch": 0.30925628678437667, "grad_norm": 1496.8067626953125, "kl": 1.7905502319335938, "learning_rate": 4.999859973880157e-06, "loss": 0.0716, "reward": 1.7025312185287476, "reward_std": 0.6074333786964417, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.31190624833106995, "step": 578 }, { "completion_length": 117.28125, "epoch": 0.3097913322632424, "grad_norm": 0.852471113204956, "kl": 0.12911485135555267, "learning_rate": 4.999843015877033e-06, "loss": 0.0052, "reward": 1.7140936851501465, "reward_std": 0.6834829449653625, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29221874475479126, "step": 579 }, { "completion_length": 140.875, "epoch": 0.3103263777421081, "grad_norm": 3.0198729038238525, "kl": 0.2327173352241516, "learning_rate": 4.999825088886852e-06, "loss": 0.0093, "reward": 1.2852811813354492, "reward_std": 0.7766664028167725, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.3634062707424164, "step": 580 }, { "completion_length": 116.0625, "epoch": 0.31086142322097376, "grad_norm": 1.1708544492721558, "kl": 0.17702460289001465, "learning_rate": 4.999806192916565e-06, "loss": 0.0071, "reward": 1.491687536239624, "reward_std": 0.8915790915489197, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36668747663497925, "step": 581 }, { "completion_length": 127.53125, "epoch": 0.3113964686998395, "grad_norm": 0.9972829818725586, "kl": 0.13652187585830688, "learning_rate": 4.9997863279734925e-06, "loss": 0.0055, "reward": 1.3057498931884766, "reward_std": 0.8143401145935059, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.3370000123977661, "step": 582 }, { "completion_length": 152.9375, "epoch": 0.3119315141787052, "grad_norm": 1.4997421503067017, "kl": 0.1079225242137909, "learning_rate": 4.999765494065339e-06, "loss": 0.0043, "reward": 0.9188125133514404, "reward_std": 0.6988115906715393, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.26256248354911804, "step": 583 }, { "completion_length": 95.5625, "epoch": 0.3124665596575709, "grad_norm": 373950.375, "kl": 335.281005859375, "learning_rate": 4.999743691200176e-06, "loss": 13.4112, "reward": 2.34403133392334, "reward_std": 1.158963918685913, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.4221562445163727, "step": 584 }, { "completion_length": 133.40625, "epoch": 0.3130016051364366, "grad_norm": 1.2995703220367432, "kl": 0.13279804587364197, "learning_rate": 4.999720919386458e-06, "loss": 0.0053, "reward": 0.9924062490463257, "reward_std": 0.613679051399231, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.3517812490463257, "step": 585 }, { "completion_length": 133.90625, "epoch": 0.3135366506153023, "grad_norm": 2.4040474891662598, "kl": 0.1998138725757599, "learning_rate": 4.999697178633009e-06, "loss": 0.008, "reward": 1.517468810081482, "reward_std": 0.4891775846481323, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.32996875047683716, "step": 586 }, { "completion_length": 143.21875, "epoch": 0.314071696094168, "grad_norm": 1.2014886140823364, "kl": 0.16049404442310333, "learning_rate": 4.999672468949035e-06, "loss": 0.0064, "reward": 0.9267187118530273, "reward_std": 0.838442862033844, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.31734374165534973, "step": 587 }, { "completion_length": 120.25, "epoch": 0.3146067415730337, "grad_norm": 0.9917144179344177, "kl": 0.1905466765165329, "learning_rate": 4.999646790344111e-06, "loss": 0.0076, "reward": 1.5671563148498535, "reward_std": 0.9768866896629333, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.36403125524520874, "step": 588 }, { "completion_length": 117.90625, "epoch": 0.31514178705189944, "grad_norm": 22000.19140625, "kl": 113.00740051269531, "learning_rate": 4.99962014282819e-06, "loss": 4.5203, "reward": 2.0957188606262207, "reward_std": 1.0873019695281982, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.109375, "rewards/xmlcount_reward_func": 0.37696874141693115, "step": 589 }, { "completion_length": 136.03125, "epoch": 0.3156768325307651, "grad_norm": 1.3982230424880981, "kl": 0.2074296474456787, "learning_rate": 4.9995925264116035e-06, "loss": 0.0083, "reward": 1.0325312614440918, "reward_std": 0.6952790021896362, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.3450312614440918, "step": 590 }, { "completion_length": 122.3125, "epoch": 0.3162118780096308, "grad_norm": 2.674365520477295, "kl": 0.5404149293899536, "learning_rate": 4.999563941105055e-06, "loss": 0.0216, "reward": 1.581781268119812, "reward_std": 0.5713056921958923, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.316156268119812, "step": 591 }, { "completion_length": 135.125, "epoch": 0.31674692348849653, "grad_norm": 411966.75, "kl": 6289.87646484375, "learning_rate": 4.999534386919625e-06, "loss": 251.5951, "reward": 1.5433437824249268, "reward_std": 0.46109461784362793, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.387093722820282, "step": 592 }, { "completion_length": 131.3125, "epoch": 0.3172819689673622, "grad_norm": 122.46676635742188, "kl": 0.30829301476478577, "learning_rate": 4.999503863866768e-06, "loss": 0.0123, "reward": 1.069406270980835, "reward_std": 0.7192955017089844, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.3819062411785126, "step": 593 }, { "completion_length": 149.5625, "epoch": 0.31781701444622795, "grad_norm": 1.6569898128509521, "kl": 0.15795278549194336, "learning_rate": 4.999472371958317e-06, "loss": 0.0063, "reward": 0.9234374761581421, "reward_std": 0.6383514404296875, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.3296874761581421, "step": 594 }, { "completion_length": 144.0, "epoch": 0.31835205992509363, "grad_norm": 1.0936322212219238, "kl": 0.18437254428863525, "learning_rate": 4.999439911206478e-06, "loss": 0.0074, "reward": 0.8932499885559082, "reward_std": 0.47531843185424805, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.3619999885559082, "step": 595 }, { "completion_length": 147.46875, "epoch": 0.3188871054039593, "grad_norm": 0.9163367748260498, "kl": 0.10937200486660004, "learning_rate": 4.999406481623834e-06, "loss": 0.0044, "reward": 1.6533437967300415, "reward_std": 1.0837866067886353, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.34084373712539673, "step": 596 }, { "completion_length": 135.90625, "epoch": 0.31942215088282505, "grad_norm": 0.6767679452896118, "kl": 0.12629003822803497, "learning_rate": 4.999372083223343e-06, "loss": 0.0051, "reward": 1.2588436603546143, "reward_std": 0.6404870748519897, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.3057187497615814, "step": 597 }, { "completion_length": 145.21875, "epoch": 0.31995719636169073, "grad_norm": 791496192.0, "kl": 1092088.25, "learning_rate": 4.999336716018337e-06, "loss": 43683.5312, "reward": 1.0287188291549683, "reward_std": 0.6211959719657898, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.3880937397480011, "step": 598 }, { "completion_length": 138.59375, "epoch": 0.32049224184055647, "grad_norm": 0.7096196413040161, "kl": 0.1272144764661789, "learning_rate": 4.999300380022526e-06, "loss": 0.0051, "reward": 1.5629374980926514, "reward_std": 0.7272557616233826, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.31293749809265137, "step": 599 }, { "completion_length": 116.1875, "epoch": 0.32102728731942215, "grad_norm": 1.8224228620529175, "kl": 0.19250953197479248, "learning_rate": 4.999263075249995e-06, "loss": 0.0077, "reward": 1.8589062690734863, "reward_std": 0.8889896869659424, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.39015626907348633, "step": 600 }, { "completion_length": 159.8125, "epoch": 0.32156233279828783, "grad_norm": 25.390575408935547, "kl": 0.3181711435317993, "learning_rate": 4.999224801715204e-06, "loss": 0.0127, "reward": 0.7849375009536743, "reward_std": 0.6093041896820068, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.3318125009536743, "step": 601 }, { "completion_length": 151.59375, "epoch": 0.32209737827715357, "grad_norm": 39.788414001464844, "kl": 2.45922589302063, "learning_rate": 4.999185559432987e-06, "loss": 0.0984, "reward": 0.9503124952316284, "reward_std": 0.8585848808288574, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.3409374952316284, "step": 602 }, { "completion_length": 137.84375, "epoch": 0.32263242375601925, "grad_norm": 2.1193437576293945, "kl": 0.14326760172843933, "learning_rate": 4.999145348418558e-06, "loss": 0.0057, "reward": 1.6260312795639038, "reward_std": 1.069470763206482, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.37603121995925903, "step": 603 }, { "completion_length": 143.28125, "epoch": 0.323167469234885, "grad_norm": 3.314002752304077, "kl": 0.1536460518836975, "learning_rate": 4.999104168687501e-06, "loss": 0.0061, "reward": 1.0201563835144043, "reward_std": 0.7691357135772705, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.42640623450279236, "step": 604 }, { "completion_length": 141.03125, "epoch": 0.32370251471375067, "grad_norm": 1.141831874847412, "kl": 0.17006751894950867, "learning_rate": 4.9990620202557785e-06, "loss": 0.0068, "reward": 1.3425936698913574, "reward_std": 0.49343711137771606, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.078125, "rewards/xmlcount_reward_func": 0.3894687294960022, "step": 605 }, { "completion_length": 137.90625, "epoch": 0.32423756019261635, "grad_norm": 0.5504063963890076, "kl": 0.09850157797336578, "learning_rate": 4.999018903139729e-06, "loss": 0.0039, "reward": 1.793562412261963, "reward_std": 0.6537663340568542, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.43418750166893005, "step": 606 }, { "completion_length": 127.8125, "epoch": 0.3247726056714821, "grad_norm": 1.458595633506775, "kl": 0.1324320137500763, "learning_rate": 4.998974817356063e-06, "loss": 0.0053, "reward": 1.7363438606262207, "reward_std": 0.8356847167015076, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.078125, "rewards/xmlcount_reward_func": 0.39259374141693115, "step": 607 }, { "completion_length": 129.03125, "epoch": 0.32530765115034777, "grad_norm": 1.7544200420379639, "kl": 0.1970241814851761, "learning_rate": 4.998929762921873e-06, "loss": 0.0079, "reward": 1.5886874198913574, "reward_std": 1.0122904777526855, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, "rewards/xmlcount_reward_func": 0.4480625092983246, "step": 608 }, { "completion_length": 144.5, "epoch": 0.3258426966292135, "grad_norm": 7.454137802124023, "kl": 0.19874060153961182, "learning_rate": 4.998883739854621e-06, "loss": 0.0079, "reward": 0.855343759059906, "reward_std": 0.5249407291412354, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.370968759059906, "step": 609 }, { "completion_length": 127.46875, "epoch": 0.3263777421080792, "grad_norm": 2.862013578414917, "kl": 0.27929067611694336, "learning_rate": 4.998836748172145e-06, "loss": 0.0112, "reward": 1.8349688053131104, "reward_std": 0.9041856527328491, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.3974687457084656, "step": 610 }, { "completion_length": 136.1875, "epoch": 0.32691278758694486, "grad_norm": 4.344705581665039, "kl": 0.19361385703086853, "learning_rate": 4.998788787892662e-06, "loss": 0.0077, "reward": 1.1730625629425049, "reward_std": 0.6775665283203125, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.4230625033378601, "step": 611 }, { "completion_length": 133.625, "epoch": 0.3274478330658106, "grad_norm": 5.660914897918701, "kl": 0.6091465353965759, "learning_rate": 4.998739859034761e-06, "loss": 0.0244, "reward": 1.536468744277954, "reward_std": 0.745502233505249, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.4270937442779541, "step": 612 }, { "completion_length": 119.125, "epoch": 0.3279828785446763, "grad_norm": 3.0977745056152344, "kl": 0.42911285161972046, "learning_rate": 4.998689961617409e-06, "loss": 0.0172, "reward": 2.182374954223633, "reward_std": 1.1025352478027344, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.109375, "rewards/xmlcount_reward_func": 0.4792499840259552, "step": 613 }, { "completion_length": 150.5625, "epoch": 0.328517924023542, "grad_norm": 0.6836509704589844, "kl": 0.134546160697937, "learning_rate": 4.998639095659945e-06, "loss": 0.0054, "reward": 1.0809061527252197, "reward_std": 0.9838663935661316, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.3934062719345093, "step": 614 }, { "completion_length": 128.9375, "epoch": 0.3290529695024077, "grad_norm": 1.0381243228912354, "kl": 0.18597553670406342, "learning_rate": 4.9985872611820885e-06, "loss": 0.0074, "reward": 1.8084375858306885, "reward_std": 1.1493473052978516, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.4334375262260437, "step": 615 }, { "completion_length": 147.1875, "epoch": 0.3295880149812734, "grad_norm": 0.8365876078605652, "kl": 0.21423737704753876, "learning_rate": 4.99853445820393e-06, "loss": 0.0086, "reward": 1.2317500114440918, "reward_std": 0.5946328043937683, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.4192500114440918, "step": 616 }, { "completion_length": 147.03125, "epoch": 0.3301230604601391, "grad_norm": 3.081584930419922, "kl": 0.16491663455963135, "learning_rate": 4.998480686745936e-06, "loss": 0.0066, "reward": 1.3966875076293945, "reward_std": 0.8478909134864807, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.109375, "rewards/xmlcount_reward_func": 0.38106250762939453, "step": 617 }, { "completion_length": 125.0625, "epoch": 0.3306581059390048, "grad_norm": 4.451241493225098, "kl": 0.178180992603302, "learning_rate": 4.998425946828951e-06, "loss": 0.0071, "reward": 1.5972813367843628, "reward_std": 0.7990290522575378, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.472281277179718, "step": 618 }, { "completion_length": 134.90625, "epoch": 0.33119315141787053, "grad_norm": 2.644062042236328, "kl": 0.4177224338054657, "learning_rate": 4.998370238474193e-06, "loss": 0.0167, "reward": 1.8522498607635498, "reward_std": 0.43810170888900757, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.41474997997283936, "step": 619 }, { "completion_length": 149.09375, "epoch": 0.3317281968967362, "grad_norm": 1.4066658020019531, "kl": 0.15488359332084656, "learning_rate": 4.998313561703255e-06, "loss": 0.0062, "reward": 1.124843716621399, "reward_std": 0.8706416487693787, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.078125, "rewards/xmlcount_reward_func": 0.3435937762260437, "step": 620 }, { "completion_length": 156.21875, "epoch": 0.33226324237560195, "grad_norm": 4.993153095245361, "kl": 0.1736719310283661, "learning_rate": 4.998255916538106e-06, "loss": 0.0069, "reward": 1.6379687786102295, "reward_std": 0.791119396686554, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.3567187786102295, "step": 621 }, { "completion_length": 135.5, "epoch": 0.33279828785446763, "grad_norm": 1.3882992267608643, "kl": 0.1762954294681549, "learning_rate": 4.998197303001091e-06, "loss": 0.0071, "reward": 1.427375078201294, "reward_std": 0.7312619686126709, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, "rewards/xmlcount_reward_func": 0.44300001859664917, "step": 622 }, { "completion_length": 113.3125, "epoch": 0.3333333333333333, "grad_norm": 0.8912449479103088, "kl": 0.218331441283226, "learning_rate": 4.998137721114929e-06, "loss": 0.0087, "reward": 2.061718702316284, "reward_std": 0.9467214941978455, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, "rewards/xmlcount_reward_func": 0.46796876192092896, "step": 623 }, { "completion_length": 134.71875, "epoch": 0.33386837881219905, "grad_norm": 11.016603469848633, "kl": 0.6588174104690552, "learning_rate": 4.998077170902716e-06, "loss": 0.0264, "reward": 1.4995625019073486, "reward_std": 0.7851547002792358, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.35893750190734863, "step": 624 }, { "completion_length": 139.25, "epoch": 0.33440342429106473, "grad_norm": 1.178004264831543, "kl": 0.11515545845031738, "learning_rate": 4.998015652387921e-06, "loss": 0.0046, "reward": 1.2379062175750732, "reward_std": 0.7910526990890503, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.394156277179718, "step": 625 }, { "completion_length": 131.0625, "epoch": 0.33493846976993047, "grad_norm": 1.7204867601394653, "kl": 0.1615171730518341, "learning_rate": 4.9979531655943915e-06, "loss": 0.0065, "reward": 1.7489063739776611, "reward_std": 0.7681006193161011, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.4520312547683716, "step": 626 }, { "completion_length": 147.90625, "epoch": 0.33547351524879615, "grad_norm": 1410.8048095703125, "kl": 2.4036865234375, "learning_rate": 4.9978897105463475e-06, "loss": 0.0961, "reward": 1.005968689918518, "reward_std": 0.7152736186981201, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.39659374952316284, "step": 627 }, { "completion_length": 138.09375, "epoch": 0.33600856072766183, "grad_norm": 2.6345465183258057, "kl": 0.42332959175109863, "learning_rate": 4.9978252872683855e-06, "loss": 0.0169, "reward": 0.9794062376022339, "reward_std": 0.5143468976020813, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.4012812376022339, "step": 628 }, { "completion_length": 142.40625, "epoch": 0.33654360620652757, "grad_norm": 1.1392912864685059, "kl": 0.14564648270606995, "learning_rate": 4.997759895785477e-06, "loss": 0.0058, "reward": 0.7856875061988831, "reward_std": 0.3402634263038635, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.42631250619888306, "step": 629 }, { "completion_length": 151.75, "epoch": 0.33707865168539325, "grad_norm": 1.2571276426315308, "kl": 0.11973158270120621, "learning_rate": 4.9976935361229696e-06, "loss": 0.0048, "reward": 1.1762499809265137, "reward_std": 0.7730975151062012, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.37937501072883606, "step": 630 }, { "completion_length": 111.875, "epoch": 0.337613697164259, "grad_norm": 18031.169921875, "kl": 22.803930282592773, "learning_rate": 4.997626208306585e-06, "loss": 0.9122, "reward": 1.4416875839233398, "reward_std": 0.7366894483566284, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.4573124945163727, "step": 631 }, { "completion_length": 126.03125, "epoch": 0.33814874264312467, "grad_norm": 2626155008.0, "kl": 3299631.25, "learning_rate": 4.997557912362421e-06, "loss": 131985.2656, "reward": 1.7250312566757202, "reward_std": 0.7625637054443359, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, "rewards/xmlcount_reward_func": 0.4594062566757202, "step": 632 }, { "completion_length": 157.65625, "epoch": 0.33868378812199035, "grad_norm": 1.1936869621276855, "kl": 0.15167662501335144, "learning_rate": 4.997488648316951e-06, "loss": 0.0061, "reward": 1.3039374351501465, "reward_std": 1.070563793182373, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, "rewards/xmlcount_reward_func": 0.38206252455711365, "step": 633 }, { "completion_length": 135.625, "epoch": 0.3392188336008561, "grad_norm": 12597.6630859375, "kl": 25.435768127441406, "learning_rate": 4.997418416197022e-06, "loss": 1.0174, "reward": 1.903937578201294, "reward_std": 0.8787837028503418, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.171875, "rewards/xmlcount_reward_func": 0.40393751859664917, "step": 634 }, { "completion_length": 145.46875, "epoch": 0.33975387907972177, "grad_norm": 6234.935546875, "kl": 543.5054321289062, "learning_rate": 4.997347216029858e-06, "loss": 21.7402, "reward": 1.0680937767028809, "reward_std": 0.57540363073349, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.171875, "rewards/xmlcount_reward_func": 0.3962187170982361, "step": 635 }, { "completion_length": 138.84375, "epoch": 0.3402889245585875, "grad_norm": 70.95686340332031, "kl": 0.8122953176498413, "learning_rate": 4.997275047843057e-06, "loss": 0.0325, "reward": 1.5123438835144043, "reward_std": 0.7032734155654907, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.109375, "rewards/xmlcount_reward_func": 0.40296873450279236, "step": 636 }, { "completion_length": 152.25, "epoch": 0.3408239700374532, "grad_norm": 2.6173007488250732, "kl": 0.14247913658618927, "learning_rate": 4.997201911664594e-06, "loss": 0.0057, "reward": 0.9345625638961792, "reward_std": 0.6692569851875305, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.3251875042915344, "step": 637 }, { "completion_length": 156.53125, "epoch": 0.34135901551631886, "grad_norm": 1.8304734230041504, "kl": 0.12120171636343002, "learning_rate": 4.997127807522817e-06, "loss": 0.0048, "reward": 1.0245000123977661, "reward_std": 0.7864774465560913, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.171875, "rewards/xmlcount_reward_func": 0.3526250123977661, "step": 638 }, { "completion_length": 142.0625, "epoch": 0.3418940609951846, "grad_norm": 2.8426973819732666, "kl": 0.18715256452560425, "learning_rate": 4.997052735446449e-06, "loss": 0.0075, "reward": 1.7025936841964722, "reward_std": 1.2516844272613525, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.40571877360343933, "step": 639 }, { "completion_length": 137.9375, "epoch": 0.3424291064740503, "grad_norm": 1.4601032733917236, "kl": 0.2549961507320404, "learning_rate": 4.996976695464592e-06, "loss": 0.0102, "reward": 1.5622186660766602, "reward_std": 0.9469016790390015, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, "rewards/xmlcount_reward_func": 0.4215937554836273, "step": 640 }, { "completion_length": 144.03125, "epoch": 0.342964151952916, "grad_norm": 927135.25, "kl": 61684.76171875, "learning_rate": 4.9968996876067185e-06, "loss": 2467.3906, "reward": 1.723062515258789, "reward_std": 1.0256526470184326, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.109375, "rewards/xmlcount_reward_func": 0.44181251525878906, "step": 641 }, { "completion_length": 126.78125, "epoch": 0.3434991974317817, "grad_norm": 37745.10546875, "kl": 150.343994140625, "learning_rate": 4.9968217119026795e-06, "loss": 6.0138, "reward": 1.760812520980835, "reward_std": 0.7732563018798828, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.4951874911785126, "step": 642 }, { "completion_length": 141.03125, "epoch": 0.3440342429106474, "grad_norm": 5.734782695770264, "kl": 0.2717173099517822, "learning_rate": 4.996742768382699e-06, "loss": 0.0109, "reward": 1.1189687252044678, "reward_std": 0.9464749097824097, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.36896875500679016, "step": 643 }, { "completion_length": 142.71875, "epoch": 0.3445692883895131, "grad_norm": 1.4612617492675781, "kl": 0.20422273874282837, "learning_rate": 4.9966628570773775e-06, "loss": 0.0082, "reward": 1.556249976158142, "reward_std": 0.8354780077934265, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.4468750059604645, "step": 644 }, { "completion_length": 145.90625, "epoch": 0.3451043338683788, "grad_norm": 8864420.0, "kl": 20969.662109375, "learning_rate": 4.9965819780176905e-06, "loss": 838.7865, "reward": 1.8004062175750732, "reward_std": 0.7419819831848145, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, "rewards/xmlcount_reward_func": 0.425406277179718, "step": 645 }, { "completion_length": 128.65625, "epoch": 0.34563937934724454, "grad_norm": 0.9063666462898254, "kl": 0.1951771080493927, "learning_rate": 4.996500131234987e-06, "loss": 0.0078, "reward": 1.2519999742507935, "reward_std": 0.8762633204460144, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.43949997425079346, "step": 646 }, { "completion_length": 171.96875, "epoch": 0.3461744248261102, "grad_norm": 1.269155502319336, "kl": 0.12643957138061523, "learning_rate": 4.996417316760993e-06, "loss": 0.0051, "reward": 1.5310938358306885, "reward_std": 0.957631528377533, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1875, "rewards/xmlcount_reward_func": 0.3748437166213989, "step": 647 }, { "completion_length": 151.09375, "epoch": 0.3467094703049759, "grad_norm": 1.4182478189468384, "kl": 0.11389486491680145, "learning_rate": 4.99633353462781e-06, "loss": 0.0046, "reward": 1.4207812547683716, "reward_std": 1.2029505968093872, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, "rewards/xmlcount_reward_func": 0.4207812547683716, "step": 648 }, { "completion_length": 129.75, "epoch": 0.34724451578384163, "grad_norm": 0.9342026710510254, "kl": 0.15105551481246948, "learning_rate": 4.996248784867912e-06, "loss": 0.006, "reward": 0.9152812957763672, "reward_std": 0.47998255491256714, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.4309062361717224, "step": 649 }, { "completion_length": 134.84375, "epoch": 0.3477795612627073, "grad_norm": 2.9451818466186523, "kl": 0.1728583574295044, "learning_rate": 4.996163067514151e-06, "loss": 0.0069, "reward": 2.1444687843322754, "reward_std": 0.8792752623558044, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.203125, "rewards/xmlcount_reward_func": 0.456968754529953, "step": 650 }, { "completion_length": 139.59375, "epoch": 0.34831460674157305, "grad_norm": 2518171136.0, "kl": 9415098.0, "learning_rate": 4.996076382599751e-06, "loss": 376603.9375, "reward": 1.7329375743865967, "reward_std": 0.9146665930747986, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, "rewards/xmlcount_reward_func": 0.4048125147819519, "step": 651 }, { "completion_length": 122.9375, "epoch": 0.34884965222043873, "grad_norm": 2.7229690551757812, "kl": 0.20055633783340454, "learning_rate": 4.995988730158315e-06, "loss": 0.008, "reward": 1.4012500047683716, "reward_std": 0.7518187761306763, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.3700000047683716, "step": 652 }, { "completion_length": 142.8125, "epoch": 0.3493846976993044, "grad_norm": 0.9584621787071228, "kl": 0.16193035244941711, "learning_rate": 4.995900110223817e-06, "loss": 0.0065, "reward": 1.698625087738037, "reward_std": 0.8932920694351196, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.41737496852874756, "step": 653 }, { "completion_length": 120.5, "epoch": 0.34991974317817015, "grad_norm": 1.1302649974822998, "kl": 0.19565516710281372, "learning_rate": 4.9958105228306085e-06, "loss": 0.0078, "reward": 1.8240938186645508, "reward_std": 1.0404510498046875, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.480343759059906, "step": 654 }, { "completion_length": 133.375, "epoch": 0.35045478865703583, "grad_norm": 1.1027365922927856, "kl": 0.1657431423664093, "learning_rate": 4.995719968013415e-06, "loss": 0.0066, "reward": 1.2359063625335693, "reward_std": 0.7187906503677368, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.203125, "rewards/xmlcount_reward_func": 0.4390312731266022, "step": 655 }, { "completion_length": 157.84375, "epoch": 0.35098983413590157, "grad_norm": 1.6712409257888794, "kl": 0.11860720813274384, "learning_rate": 4.9956284458073366e-06, "loss": 0.0047, "reward": 0.8655624985694885, "reward_std": 0.6454119682312012, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.3655624985694885, "step": 656 }, { "completion_length": 134.125, "epoch": 0.35152487961476725, "grad_norm": 4.971541881561279, "kl": 0.17569836974143982, "learning_rate": 4.995535956247851e-06, "loss": 0.007, "reward": 1.5906250476837158, "reward_std": 0.9654603004455566, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.41874998807907104, "step": 657 }, { "completion_length": 147.625, "epoch": 0.352059925093633, "grad_norm": 87.5777359008789, "kl": 0.281066358089447, "learning_rate": 4.995442499370807e-06, "loss": 0.0112, "reward": 1.4424062967300415, "reward_std": 1.021539568901062, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.42678123712539673, "step": 658 }, { "completion_length": 149.625, "epoch": 0.35259497057249867, "grad_norm": 5.569086074829102, "kl": 0.16595490276813507, "learning_rate": 4.995348075212431e-06, "loss": 0.0066, "reward": 1.4609375, "reward_std": 0.81340092420578, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.3828125, "step": 659 }, { "completion_length": 157.40625, "epoch": 0.35313001605136435, "grad_norm": 2.4558143615722656, "kl": 0.15524426102638245, "learning_rate": 4.995252683809324e-06, "loss": 0.0062, "reward": 1.6999688148498535, "reward_std": 0.85500168800354, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.43434375524520874, "step": 660 }, { "completion_length": 125.03125, "epoch": 0.3536650615302301, "grad_norm": 4.963600158691406, "kl": 0.23386293649673462, "learning_rate": 4.995156325198461e-06, "loss": 0.0094, "reward": 1.863031268119812, "reward_std": 0.8771955966949463, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, "rewards/xmlcount_reward_func": 0.472406268119812, "step": 661 }, { "completion_length": 129.03125, "epoch": 0.35420010700909577, "grad_norm": 2.6273913383483887, "kl": 0.15894216299057007, "learning_rate": 4.995058999417192e-06, "loss": 0.0064, "reward": 1.6687812805175781, "reward_std": 0.6695970296859741, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.4656562805175781, "step": 662 }, { "completion_length": 160.28125, "epoch": 0.3547351524879615, "grad_norm": 0.9304214119911194, "kl": 0.11861728876829147, "learning_rate": 4.994960706503244e-06, "loss": 0.0047, "reward": 1.1604063510894775, "reward_std": 0.6522258520126343, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1875, "rewards/xmlcount_reward_func": 0.3947812616825104, "step": 663 }, { "completion_length": 158.9375, "epoch": 0.3552701979668272, "grad_norm": 0.8014692664146423, "kl": 0.11385706812143326, "learning_rate": 4.994861446494717e-06, "loss": 0.0046, "reward": 1.0726561546325684, "reward_std": 1.0354738235473633, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.171875, "rewards/xmlcount_reward_func": 0.3695312738418579, "step": 664 }, { "completion_length": 148.5, "epoch": 0.35580524344569286, "grad_norm": 1.5114284753799438, "kl": 0.13425913453102112, "learning_rate": 4.994761219430084e-06, "loss": 0.0054, "reward": 1.2825312614440918, "reward_std": 0.8561055660247803, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.3762812614440918, "step": 665 }, { "completion_length": 153.09375, "epoch": 0.3563402889245586, "grad_norm": 553856.3125, "kl": 22391.833984375, "learning_rate": 4.9946600253481966e-06, "loss": 895.6735, "reward": 1.261812448501587, "reward_std": 0.9045412540435791, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.171875, "rewards/xmlcount_reward_func": 0.3868125081062317, "step": 666 }, { "completion_length": 123.6875, "epoch": 0.3568753344034243, "grad_norm": 0.9512283205986023, "kl": 0.13925880193710327, "learning_rate": 4.994557864288278e-06, "loss": 0.0056, "reward": 1.8156875371932983, "reward_std": 0.6263072490692139, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.47193750739097595, "step": 667 }, { "completion_length": 138.84375, "epoch": 0.35741037988229, "grad_norm": 2.1315276622772217, "kl": 0.19271384179592133, "learning_rate": 4.994454736289931e-06, "loss": 0.0077, "reward": 0.7760937213897705, "reward_std": 0.5442997217178345, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.078125, "rewards/xmlcount_reward_func": 0.3854687511920929, "step": 668 }, { "completion_length": 128.96875, "epoch": 0.3579454253611557, "grad_norm": 7.005695819854736, "kl": 0.20590484142303467, "learning_rate": 4.994350641393127e-06, "loss": 0.0082, "reward": 1.8816249370574951, "reward_std": 1.0765771865844727, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.171875, "rewards/xmlcount_reward_func": 0.4597499966621399, "step": 669 }, { "completion_length": 150.125, "epoch": 0.3584804708400214, "grad_norm": 1.6123384237289429, "kl": 0.15723133087158203, "learning_rate": 4.9942455796382154e-06, "loss": 0.0063, "reward": 1.3800938129425049, "reward_std": 0.6391048431396484, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.109375, "rewards/xmlcount_reward_func": 0.3800937831401825, "step": 670 }, { "completion_length": 171.625, "epoch": 0.3590155163188871, "grad_norm": 1.062432050704956, "kl": 0.11802569776773453, "learning_rate": 4.994139551065922e-06, "loss": 0.0047, "reward": 1.27943754196167, "reward_std": 1.0667768716812134, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.35756251215934753, "step": 671 }, { "completion_length": 125.4375, "epoch": 0.3595505617977528, "grad_norm": 0.9177863001823425, "kl": 0.16641323268413544, "learning_rate": 4.994032555717343e-06, "loss": 0.0067, "reward": 1.847156286239624, "reward_std": 0.9532105922698975, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.47215625643730164, "step": 672 }, { "completion_length": 120.46875, "epoch": 0.36008560727661854, "grad_norm": 8.2626371383667, "kl": 0.27598464488983154, "learning_rate": 4.9939245936339545e-06, "loss": 0.011, "reward": 1.7724063396453857, "reward_std": 0.8330623507499695, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, "rewards/xmlcount_reward_func": 0.49115628004074097, "step": 673 }, { "completion_length": 155.125, "epoch": 0.3606206527554842, "grad_norm": 1.1742876768112183, "kl": 0.20789597928524017, "learning_rate": 4.993815664857603e-06, "loss": 0.0083, "reward": 0.9259375333786011, "reward_std": 0.7645606994628906, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1875, "rewards/xmlcount_reward_func": 0.3790625333786011, "step": 674 }, { "completion_length": 135.84375, "epoch": 0.3611556982343499, "grad_norm": 0.7581139802932739, "kl": 0.16542485356330872, "learning_rate": 4.9937057694305115e-06, "loss": 0.0066, "reward": 2.116468906402588, "reward_std": 1.017704725265503, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.47584378719329834, "step": 675 }, { "completion_length": 121.8125, "epoch": 0.36169074371321563, "grad_norm": 1.5549986362457275, "kl": 0.22343891859054565, "learning_rate": 4.993594907395278e-06, "loss": 0.0089, "reward": 2.2246251106262207, "reward_std": 0.704434871673584, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.47462499141693115, "step": 676 }, { "completion_length": 140.25, "epoch": 0.3622257891920813, "grad_norm": 2.9856009483337402, "kl": 0.18495966494083405, "learning_rate": 4.9934830787948756e-06, "loss": 0.0074, "reward": 1.2331874370574951, "reward_std": 0.5871777534484863, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.4050624966621399, "step": 677 }, { "completion_length": 119.625, "epoch": 0.36276083467094705, "grad_norm": 0.66878741979599, "kl": 0.1819850504398346, "learning_rate": 4.9933702836726486e-06, "loss": 0.0073, "reward": 1.816656231880188, "reward_std": 0.6729620695114136, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.457281231880188, "step": 678 }, { "completion_length": 128.6875, "epoch": 0.36329588014981273, "grad_norm": 0.9952479004859924, "kl": 0.14309045672416687, "learning_rate": 4.99325652207232e-06, "loss": 0.0057, "reward": 1.6835625171661377, "reward_std": 0.5458971261978149, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4804375171661377, "step": 679 }, { "completion_length": 156.53125, "epoch": 0.3638309256286784, "grad_norm": 1.2561267614364624, "kl": 0.13008356094360352, "learning_rate": 4.993141794037988e-06, "loss": 0.0052, "reward": 0.9952500462532043, "reward_std": 0.7240844368934631, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.38587498664855957, "step": 680 }, { "completion_length": 140.21875, "epoch": 0.36436597110754415, "grad_norm": 5.835977554321289, "kl": 1.5433262586593628, "learning_rate": 4.99302609961412e-06, "loss": 0.0617, "reward": 1.9605000019073486, "reward_std": 0.9172965288162231, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.47612500190734863, "step": 681 }, { "completion_length": 161.375, "epoch": 0.36490101658640983, "grad_norm": 35344.41015625, "kl": 152.8528594970703, "learning_rate": 4.992909438845563e-06, "loss": 6.1141, "reward": 1.241281270980835, "reward_std": 0.763114333152771, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, "rewards/xmlcount_reward_func": 0.4131562411785126, "step": 682 }, { "completion_length": 118.0625, "epoch": 0.36543606206527557, "grad_norm": 0.7902030348777771, "kl": 0.17821431159973145, "learning_rate": 4.992791811777538e-06, "loss": 0.0071, "reward": 2.5583438873291016, "reward_std": 1.021491289138794, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.464593768119812, "step": 683 }, { "completion_length": 126.09375, "epoch": 0.36597110754414125, "grad_norm": 2.6249899864196777, "kl": 0.2909984588623047, "learning_rate": 4.992673218455637e-06, "loss": 0.0116, "reward": 1.693656325340271, "reward_std": 0.9273364543914795, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.4749062657356262, "step": 684 }, { "completion_length": 150.03125, "epoch": 0.36650615302300693, "grad_norm": 1.6738852262496948, "kl": 0.186381995677948, "learning_rate": 4.992553658925831e-06, "loss": 0.0075, "reward": 1.2443437576293945, "reward_std": 0.7023878693580627, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.171875, "rewards/xmlcount_reward_func": 0.41621875762939453, "step": 685 }, { "completion_length": 132.59375, "epoch": 0.36704119850187267, "grad_norm": 1429.8834228515625, "kl": 4.385125160217285, "learning_rate": 4.9924331332344634e-06, "loss": 0.1754, "reward": 1.6193749904632568, "reward_std": 0.5428872108459473, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.44749999046325684, "step": 686 }, { "completion_length": 117.3125, "epoch": 0.36757624398073835, "grad_norm": 0.9394388198852539, "kl": 0.22261527180671692, "learning_rate": 4.992311641428251e-06, "loss": 0.0089, "reward": 1.7165625095367432, "reward_std": 0.9849204421043396, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.48218750953674316, "step": 687 }, { "completion_length": 125.53125, "epoch": 0.3681112894596041, "grad_norm": 1.7616370916366577, "kl": 0.18058137595653534, "learning_rate": 4.992189183554288e-06, "loss": 0.0072, "reward": 1.2421875, "reward_std": 0.29167506098747253, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4765625, "step": 688 }, { "completion_length": 151.625, "epoch": 0.36864633493846977, "grad_norm": 1.0239166021347046, "kl": 0.14090275764465332, "learning_rate": 4.99206575966004e-06, "loss": 0.0056, "reward": 1.6303750276565552, "reward_std": 1.0354160070419312, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.3647499978542328, "step": 689 }, { "completion_length": 119.75, "epoch": 0.36918138041733545, "grad_norm": 16.810924530029297, "kl": 0.1992080807685852, "learning_rate": 4.99194136979335e-06, "loss": 0.008, "reward": 1.8940000534057617, "reward_std": 1.0194082260131836, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.44087502360343933, "step": 690 }, { "completion_length": 147.875, "epoch": 0.3697164258962012, "grad_norm": 43090.4921875, "kl": 1032.63427734375, "learning_rate": 4.991816014002432e-06, "loss": 41.3054, "reward": 1.0703125, "reward_std": 0.5756744146347046, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.4296875, "step": 691 }, { "completion_length": 158.03125, "epoch": 0.37025147137506687, "grad_norm": 0.7965002655982971, "kl": 0.12420654296875, "learning_rate": 4.991689692335877e-06, "loss": 0.005, "reward": 1.3708124160766602, "reward_std": 1.0833549499511719, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.4020625054836273, "step": 692 }, { "completion_length": 136.6875, "epoch": 0.3707865168539326, "grad_norm": 2923.044921875, "kl": 38.239540100097656, "learning_rate": 4.991562404842651e-06, "loss": 1.5296, "reward": 1.9237812757492065, "reward_std": 1.0507457256317139, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.203125, "rewards/xmlcount_reward_func": 0.42378124594688416, "step": 693 }, { "completion_length": 134.375, "epoch": 0.3713215623327983, "grad_norm": 0.9882270097732544, "kl": 0.1723557710647583, "learning_rate": 4.991434151572091e-06, "loss": 0.0069, "reward": 1.2887499332427979, "reward_std": 0.8889658451080322, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, "rewards/xmlcount_reward_func": 0.4449999928474426, "step": 694 }, { "completion_length": 142.65625, "epoch": 0.37185660781166396, "grad_norm": 2.607856512069702, "kl": 0.2949422001838684, "learning_rate": 4.991304932573912e-06, "loss": 0.0118, "reward": 1.3932812213897705, "reward_std": 1.0084095001220703, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.4245312511920929, "step": 695 }, { "completion_length": 141.5625, "epoch": 0.3723916532905297, "grad_norm": 1.5963730812072754, "kl": 0.22313383221626282, "learning_rate": 4.991174747898201e-06, "loss": 0.0089, "reward": 1.234375, "reward_std": 0.46712765097618103, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.46875, "step": 696 }, { "completion_length": 132.0, "epoch": 0.3729266987693954, "grad_norm": 1.5391879081726074, "kl": 0.2816855013370514, "learning_rate": 4.99104359759542e-06, "loss": 0.0113, "reward": 1.7304062843322754, "reward_std": 0.9956585168838501, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.464781254529953, "step": 697 }, { "completion_length": 138.125, "epoch": 0.3734617442482611, "grad_norm": 2.0695741176605225, "kl": 0.2035520076751709, "learning_rate": 4.9909114817164065e-06, "loss": 0.0081, "reward": 1.5724687576293945, "reward_std": 0.6914826035499573, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.46309375762939453, "step": 698 }, { "completion_length": 117.34375, "epoch": 0.3739967897271268, "grad_norm": 0.7853165864944458, "kl": 0.20062202215194702, "learning_rate": 4.99077840031237e-06, "loss": 0.008, "reward": 2.113156318664551, "reward_std": 1.0020618438720703, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 699 }, { "completion_length": 107.5, "epoch": 0.37453183520599254, "grad_norm": 2281124096.0, "kl": 75494592.0, "learning_rate": 4.990644353434895e-06, "loss": 3019783.5, "reward": 2.3421249389648438, "reward_std": 0.7680714130401611, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4983749985694885, "step": 700 }, { "completion_length": 130.84375, "epoch": 0.3750668806848582, "grad_norm": 0.7870486974716187, "kl": 0.19250981509685516, "learning_rate": 4.990509341135942e-06, "loss": 0.0077, "reward": 1.6020936965942383, "reward_std": 0.7967712879180908, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.46146875619888306, "step": 701 }, { "completion_length": 133.125, "epoch": 0.3756019261637239, "grad_norm": 1.6343982219696045, "kl": 0.18466100096702576, "learning_rate": 4.990373363467844e-06, "loss": 0.0074, "reward": 1.8640313148498535, "reward_std": 0.8769727945327759, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.42653125524520874, "step": 702 }, { "completion_length": 132.9375, "epoch": 0.37613697164258963, "grad_norm": 4.475081443786621, "kl": 0.5074173212051392, "learning_rate": 4.990236420483307e-06, "loss": 0.0203, "reward": 1.6829062700271606, "reward_std": 0.8064869046211243, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.46415624022483826, "step": 703 }, { "completion_length": 152.9375, "epoch": 0.3766720171214553, "grad_norm": 6.27918004989624, "kl": 0.18047866225242615, "learning_rate": 4.9900985122354154e-06, "loss": 0.0072, "reward": 1.308000087738037, "reward_std": 0.5913288593292236, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1875, "rewards/xmlcount_reward_func": 0.41737502813339233, "step": 704 }, { "completion_length": 128.6875, "epoch": 0.37720706260032105, "grad_norm": 0.9340031147003174, "kl": 0.15563634037971497, "learning_rate": 4.989959638777624e-06, "loss": 0.0062, "reward": 2.189687490463257, "reward_std": 0.7595931887626648, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1875, "rewards/xmlcount_reward_func": 0.47093749046325684, "step": 705 }, { "completion_length": 125.96875, "epoch": 0.37774210807918673, "grad_norm": 1.3658231496810913, "kl": 0.1962454468011856, "learning_rate": 4.989819800163761e-06, "loss": 0.0078, "reward": 1.9900000095367432, "reward_std": 1.020371675491333, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.44312500953674316, "step": 706 }, { "completion_length": 156.59375, "epoch": 0.3782771535580524, "grad_norm": 1.2001804113388062, "kl": 0.1095569059252739, "learning_rate": 4.989678996448033e-06, "loss": 0.0044, "reward": 1.5058125257492065, "reward_std": 1.0042606592178345, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.171875, "rewards/xmlcount_reward_func": 0.41206252574920654, "step": 707 }, { "completion_length": 124.40625, "epoch": 0.37881219903691815, "grad_norm": 1.391514539718628, "kl": 0.23700934648513794, "learning_rate": 4.989537227685017e-06, "loss": 0.0095, "reward": 2.5017499923706055, "reward_std": 0.853127658367157, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.203125, "rewards/xmlcount_reward_func": 0.48612499237060547, "step": 708 }, { "completion_length": 110.8125, "epoch": 0.37934724451578383, "grad_norm": 94.46836853027344, "kl": 2.874846935272217, "learning_rate": 4.989394493929665e-06, "loss": 0.115, "reward": 2.437375068664551, "reward_std": 1.1358625888824463, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 709 }, { "completion_length": 134.1875, "epoch": 0.37988228999464957, "grad_norm": 0.8861997723579407, "kl": 0.18121281266212463, "learning_rate": 4.989250795237304e-06, "loss": 0.0072, "reward": 1.4341875314712524, "reward_std": 0.6074366569519043, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.41856250166893005, "step": 710 }, { "completion_length": 127.65625, "epoch": 0.38041733547351525, "grad_norm": 3.3907928466796875, "kl": 0.1630147099494934, "learning_rate": 4.989106131663635e-06, "loss": 0.0065, "reward": 2.0775938034057617, "reward_std": 0.6969788074493408, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, "rewards/xmlcount_reward_func": 0.45259374380111694, "step": 711 }, { "completion_length": 151.375, "epoch": 0.38095238095238093, "grad_norm": 11.284249305725098, "kl": 0.15795084834098816, "learning_rate": 4.98896050326473e-06, "loss": 0.0063, "reward": 1.519124984741211, "reward_std": 0.8825699687004089, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.42537498474121094, "step": 712 }, { "completion_length": 139.71875, "epoch": 0.38148742643124667, "grad_norm": 0.600605845451355, "kl": 0.1403595507144928, "learning_rate": 4.988813910097041e-06, "loss": 0.0056, "reward": 1.4356250762939453, "reward_std": 0.670153021812439, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.43562501668930054, "step": 713 }, { "completion_length": 167.5625, "epoch": 0.38202247191011235, "grad_norm": 1.1690540313720703, "kl": 0.11593298614025116, "learning_rate": 4.988666352217387e-06, "loss": 0.0046, "reward": 1.1311562061309814, "reward_std": 0.8012678027153015, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.203125, "rewards/xmlcount_reward_func": 0.4124062657356262, "step": 714 }, { "completion_length": 133.40625, "epoch": 0.3825575173889781, "grad_norm": 2.007364511489868, "kl": 0.1908898502588272, "learning_rate": 4.988517829682966e-06, "loss": 0.0076, "reward": 1.9437811374664307, "reward_std": 0.9985357522964478, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.4281562566757202, "step": 715 }, { "completion_length": 141.625, "epoch": 0.38309256286784377, "grad_norm": 0.557604968547821, "kl": 0.1364486962556839, "learning_rate": 4.988368342551347e-06, "loss": 0.0055, "reward": 1.31640625, "reward_std": 0.6259641647338867, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.48828125, "step": 716 }, { "completion_length": 149.75, "epoch": 0.38362760834670945, "grad_norm": 8538718.0, "kl": 577236.75, "learning_rate": 4.988217890880475e-06, "loss": 23089.4766, "reward": 1.6258749961853027, "reward_std": 0.8364758491516113, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.37587499618530273, "step": 717 }, { "completion_length": 163.125, "epoch": 0.3841626538255752, "grad_norm": 2.0272326469421387, "kl": 0.2231968492269516, "learning_rate": 4.9880664747286684e-06, "loss": 0.0089, "reward": 1.2836250066757202, "reward_std": 1.0038204193115234, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.3930000066757202, "step": 718 }, { "completion_length": 135.84375, "epoch": 0.38469769930444087, "grad_norm": 1.421595573425293, "kl": 0.1875918209552765, "learning_rate": 4.987914094154618e-06, "loss": 0.0075, "reward": 1.8599375486373901, "reward_std": 0.6646931171417236, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.46931248903274536, "step": 719 }, { "completion_length": 124.625, "epoch": 0.3852327447833066, "grad_norm": 643882.1875, "kl": 2237.380126953125, "learning_rate": 4.98776074921739e-06, "loss": 89.4952, "reward": 2.451624870300293, "reward_std": 0.7169184684753418, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.4984999895095825, "step": 720 }, { "completion_length": 133.9375, "epoch": 0.3857677902621723, "grad_norm": 0.9758539795875549, "kl": 0.15324202179908752, "learning_rate": 4.987606439976423e-06, "loss": 0.0061, "reward": 1.7460312843322754, "reward_std": 1.0240585803985596, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1875, "rewards/xmlcount_reward_func": 0.449156254529953, "step": 721 }, { "completion_length": 114.59375, "epoch": 0.38630283574103796, "grad_norm": 2.027873992919922, "kl": 0.21622294187545776, "learning_rate": 4.987451166491531e-06, "loss": 0.0086, "reward": 1.464593768119812, "reward_std": 0.7758100032806396, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.495843768119812, "step": 722 }, { "completion_length": 152.21875, "epoch": 0.3868378812199037, "grad_norm": 0.954278826713562, "kl": 0.17934544384479523, "learning_rate": 4.987294928822901e-06, "loss": 0.0072, "reward": 1.4266562461853027, "reward_std": 1.0226136445999146, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.203125, "rewards/xmlcount_reward_func": 0.41103124618530273, "step": 723 }, { "completion_length": 142.0625, "epoch": 0.3873729266987694, "grad_norm": 88.19017791748047, "kl": 12.839734077453613, "learning_rate": 4.987137727031093e-06, "loss": 0.5136, "reward": 1.450624942779541, "reward_std": 0.8779692649841309, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.4506250023841858, "step": 724 }, { "completion_length": 129.03125, "epoch": 0.3879079721776351, "grad_norm": 13736.978515625, "kl": 175.1210479736328, "learning_rate": 4.986979561177043e-06, "loss": 7.0048, "reward": 2.2183125019073486, "reward_std": 1.0774662494659424, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.46831250190734863, "step": 725 }, { "completion_length": 124.21875, "epoch": 0.3884430176565008, "grad_norm": 0.6631684899330139, "kl": 0.17078527808189392, "learning_rate": 4.986820431322057e-06, "loss": 0.0068, "reward": 1.5920937061309814, "reward_std": 0.9552451372146606, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.4670937657356262, "step": 726 }, { "completion_length": 122.6875, "epoch": 0.3889780631353665, "grad_norm": 0.683224618434906, "kl": 0.1517099142074585, "learning_rate": 4.986660337527819e-06, "loss": 0.0061, "reward": 2.1477813720703125, "reward_std": 0.8890906572341919, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.46028125286102295, "step": 727 }, { "completion_length": 155.59375, "epoch": 0.3895131086142322, "grad_norm": 244177.953125, "kl": 1026.3297119140625, "learning_rate": 4.9864992798563826e-06, "loss": 41.0532, "reward": 1.3327813148498535, "reward_std": 0.6884583234786987, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.44215625524520874, "step": 728 }, { "completion_length": 168.25, "epoch": 0.3900481540930979, "grad_norm": 0.8833492994308472, "kl": 0.14034079015254974, "learning_rate": 4.9863372583701765e-06, "loss": 0.0056, "reward": 0.6961250305175781, "reward_std": 0.40845292806625366, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, "rewards/xmlcount_reward_func": 0.33675000071525574, "step": 729 }, { "completion_length": 143.34375, "epoch": 0.39058319957196364, "grad_norm": 1.11428701877594, "kl": 0.15376317501068115, "learning_rate": 4.986174273132006e-06, "loss": 0.0062, "reward": 2.217937469482422, "reward_std": 1.0255444049835205, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43668749928474426, "step": 730 }, { "completion_length": 134.1875, "epoch": 0.3911182450508293, "grad_norm": 1.0696545839309692, "kl": 0.14756274223327637, "learning_rate": 4.9860103242050455e-06, "loss": 0.0059, "reward": 2.191281318664551, "reward_std": 1.0703459978103638, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 731 }, { "completion_length": 140.96875, "epoch": 0.391653290529695, "grad_norm": 34.635807037353516, "kl": 3.295614242553711, "learning_rate": 4.985845411652843e-06, "loss": 0.1318, "reward": 1.6240313053131104, "reward_std": 0.9223899841308594, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4365312457084656, "step": 732 }, { "completion_length": 122.125, "epoch": 0.39218833600856073, "grad_norm": 3.4785633087158203, "kl": 0.2249009609222412, "learning_rate": 4.985679535539326e-06, "loss": 0.009, "reward": 2.01171875, "reward_std": 1.0834839344024658, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.46484375, "step": 733 }, { "completion_length": 131.96875, "epoch": 0.3927233814874264, "grad_norm": 3.4041287899017334, "kl": 0.17392848432064056, "learning_rate": 4.985512695928788e-06, "loss": 0.007, "reward": 2.02734375, "reward_std": 0.7234352827072144, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46484375, "step": 734 }, { "completion_length": 156.59375, "epoch": 0.39325842696629215, "grad_norm": 1.8457138538360596, "kl": 0.21907375752925873, "learning_rate": 4.985344892885899e-06, "loss": 0.0088, "reward": 1.269687533378601, "reward_std": 0.6068546772003174, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.203125, "rewards/xmlcount_reward_func": 0.3946875035762787, "step": 735 }, { "completion_length": 141.84375, "epoch": 0.39379347244515783, "grad_norm": 2.5718774795532227, "kl": 0.16762152314186096, "learning_rate": 4.985176126475706e-06, "loss": 0.0067, "reward": 1.5159687995910645, "reward_std": 0.9064605236053467, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4534687399864197, "step": 736 }, { "completion_length": 126.96875, "epoch": 0.39432851792402357, "grad_norm": 2.734041213989258, "kl": 0.4286247193813324, "learning_rate": 4.985006396763622e-06, "loss": 0.0171, "reward": 1.5703125, "reward_std": 0.56721031665802, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.4765625, "step": 737 }, { "completion_length": 136.5, "epoch": 0.39486356340288925, "grad_norm": 5.277307510375977, "kl": 0.1962190568447113, "learning_rate": 4.984835703815441e-06, "loss": 0.0078, "reward": 2.1711249351501465, "reward_std": 0.8787245750427246, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.48362499475479126, "step": 738 }, { "completion_length": 105.8125, "epoch": 0.39539860888175493, "grad_norm": 6.189037322998047, "kl": 0.18702925741672516, "learning_rate": 4.984664047697324e-06, "loss": 0.0075, "reward": 1.6031875610351562, "reward_std": 0.6506719589233398, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4938125014305115, "step": 739 }, { "completion_length": 147.65625, "epoch": 0.39593365436062067, "grad_norm": 9.30046272277832, "kl": 0.19664737582206726, "learning_rate": 4.98449142847581e-06, "loss": 0.0079, "reward": 1.5373749732971191, "reward_std": 0.7603803873062134, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.39675000309944153, "step": 740 }, { "completion_length": 145.625, "epoch": 0.39646869983948635, "grad_norm": 2890067.5, "kl": 17108.904296875, "learning_rate": 4.984317846217808e-06, "loss": 684.3561, "reward": 2.0317187309265137, "reward_std": 0.9796345233917236, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.46921876072883606, "step": 741 }, { "completion_length": 119.78125, "epoch": 0.3970037453183521, "grad_norm": 2.3660929203033447, "kl": 0.21332406997680664, "learning_rate": 4.9841433009906036e-06, "loss": 0.0085, "reward": 2.346468687057495, "reward_std": 1.2064414024353027, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.4558437466621399, "step": 742 }, { "completion_length": 111.9375, "epoch": 0.39753879079721777, "grad_norm": 1.641915202140808, "kl": 0.21692092716693878, "learning_rate": 4.983967792861854e-06, "loss": 0.0087, "reward": 2.36328125, "reward_std": 0.6838740110397339, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.48828125, "step": 743 }, { "completion_length": 137.75, "epoch": 0.39807383627608345, "grad_norm": 693.3244018554688, "kl": 2.2465853691101074, "learning_rate": 4.983791321899587e-06, "loss": 0.0899, "reward": 1.7237187623977661, "reward_std": 0.9924967288970947, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.4580937623977661, "step": 744 }, { "completion_length": 121.28125, "epoch": 0.3986088817549492, "grad_norm": 2.6539483070373535, "kl": 0.23963165283203125, "learning_rate": 4.983613888172208e-06, "loss": 0.0096, "reward": 2.1270313262939453, "reward_std": 0.9133129119873047, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45515626668930054, "step": 745 }, { "completion_length": 135.9375, "epoch": 0.39914392723381487, "grad_norm": 159612912.0, "kl": 2061215.875, "learning_rate": 4.983435491748494e-06, "loss": 82448.6406, "reward": 2.190000057220459, "reward_std": 0.9363522529602051, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4712499976158142, "step": 746 }, { "completion_length": 131.75, "epoch": 0.3996789727126806, "grad_norm": 885217856.0, "kl": 10441741.0, "learning_rate": 4.983256132697594e-06, "loss": 417669.625, "reward": 1.4626874923706055, "reward_std": 0.8384200930595398, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.43143749237060547, "step": 747 }, { "completion_length": 126.25, "epoch": 0.4002140181915463, "grad_norm": 0.6163221597671509, "kl": 0.16754402220249176, "learning_rate": 4.983075811089032e-06, "loss": 0.0067, "reward": 1.9605000019073486, "reward_std": 0.7113668918609619, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46050000190734863, "step": 748 }, { "completion_length": 152.40625, "epoch": 0.40074906367041196, "grad_norm": 1.030328392982483, "kl": 0.177440345287323, "learning_rate": 4.982894526992702e-06, "loss": 0.0071, "reward": 1.519281268119812, "reward_std": 0.9927085638046265, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.4099062383174896, "step": 749 }, { "completion_length": 152.75, "epoch": 0.4012841091492777, "grad_norm": 17.686548233032227, "kl": 0.5982331037521362, "learning_rate": 4.982712280478875e-06, "loss": 0.0239, "reward": 1.6159999370574951, "reward_std": 0.6530873775482178, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.171875, "rewards/xmlcount_reward_func": 0.3816249966621399, "step": 750 }, { "completion_length": 144.15625, "epoch": 0.4018191546281434, "grad_norm": 0.5252220034599304, "kl": 0.12244142591953278, "learning_rate": 4.982529071618194e-06, "loss": 0.0049, "reward": 1.4150936603546143, "reward_std": 0.7397817969322205, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.4619687497615814, "step": 751 }, { "completion_length": 131.9375, "epoch": 0.4023542001070091, "grad_norm": 29749.904296875, "kl": 100.53783416748047, "learning_rate": 4.982344900481672e-06, "loss": 4.0215, "reward": 1.3597500324249268, "reward_std": 0.5517248511314392, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.48475000262260437, "step": 752 }, { "completion_length": 117.28125, "epoch": 0.4028892455858748, "grad_norm": 79.36982727050781, "kl": 1.161825180053711, "learning_rate": 4.982159767140699e-06, "loss": 0.0465, "reward": 2.5, "reward_std": 0.9466298818588257, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 753 }, { "completion_length": 166.9375, "epoch": 0.4034242910647405, "grad_norm": 1.1337422132492065, "kl": 0.13063858449459076, "learning_rate": 4.981973671667035e-06, "loss": 0.0052, "reward": 1.0926874876022339, "reward_std": 0.9476085901260376, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.3583124876022339, "step": 754 }, { "completion_length": 122.09375, "epoch": 0.4039593365436062, "grad_norm": 0.7925743460655212, "kl": 0.17699438333511353, "learning_rate": 4.981786614132815e-06, "loss": 0.0071, "reward": 2.046875, "reward_std": 0.5881871581077576, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 755 }, { "completion_length": 160.625, "epoch": 0.4044943820224719, "grad_norm": 2180.98583984375, "kl": 423.7307434082031, "learning_rate": 4.981598594610545e-06, "loss": 16.9492, "reward": 1.6941875219345093, "reward_std": 0.8892258405685425, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.4285624921321869, "step": 756 }, { "completion_length": 141.6875, "epoch": 0.40502942750133764, "grad_norm": 1.1833761930465698, "kl": 0.15385156869888306, "learning_rate": 4.981409613173107e-06, "loss": 0.0062, "reward": 2.0432188510894775, "reward_std": 0.8041025400161743, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4338437616825104, "step": 757 }, { "completion_length": 133.75, "epoch": 0.4055644729802033, "grad_norm": 5564.3876953125, "kl": 96.81336212158203, "learning_rate": 4.981219669893752e-06, "loss": 3.8725, "reward": 2.099656105041504, "reward_std": 1.03420090675354, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.42778125405311584, "step": 758 }, { "completion_length": 146.5, "epoch": 0.406099518459069, "grad_norm": 0.6117326617240906, "kl": 0.15930849313735962, "learning_rate": 4.981028764846106e-06, "loss": 0.0064, "reward": 1.7428749799728394, "reward_std": 0.8915215134620667, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.41475000977516174, "step": 759 }, { "completion_length": 155.71875, "epoch": 0.40663456393793473, "grad_norm": 0.9803729057312012, "kl": 0.1439577043056488, "learning_rate": 4.980836898104169e-06, "loss": 0.0058, "reward": 1.6219687461853027, "reward_std": 0.7546485066413879, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.40321874618530273, "step": 760 }, { "completion_length": 154.96875, "epoch": 0.4071696094168004, "grad_norm": 2.065403938293457, "kl": 0.13572193682193756, "learning_rate": 4.98064406974231e-06, "loss": 0.0054, "reward": 1.0898125171661377, "reward_std": 0.7948707342147827, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.3710624873638153, "step": 761 }, { "completion_length": 152.375, "epoch": 0.40770465489566615, "grad_norm": 12.251730918884277, "kl": 0.17205239832401276, "learning_rate": 4.980450279835275e-06, "loss": 0.0069, "reward": 1.0431562662124634, "reward_std": 0.628276526927948, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.3712812662124634, "step": 762 }, { "completion_length": 137.71875, "epoch": 0.40823970037453183, "grad_norm": 1.4579451084136963, "kl": 0.13879981637001038, "learning_rate": 4.980255528458179e-06, "loss": 0.0056, "reward": 1.660406231880188, "reward_std": 0.9641813635826111, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.457281231880188, "step": 763 }, { "completion_length": 143.625, "epoch": 0.4087747458533975, "grad_norm": 3.247462272644043, "kl": 0.23632556200027466, "learning_rate": 4.980059815686511e-06, "loss": 0.0095, "reward": 2.1846251487731934, "reward_std": 1.0084209442138672, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4502499997615814, "step": 764 }, { "completion_length": 148.59375, "epoch": 0.40930979133226325, "grad_norm": 1.5812482833862305, "kl": 0.16410955786705017, "learning_rate": 4.979863141596135e-06, "loss": 0.0066, "reward": 1.7357499599456787, "reward_std": 1.034159541130066, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4232499897480011, "step": 765 }, { "completion_length": 175.0625, "epoch": 0.40984483681112893, "grad_norm": 0.7468569278717041, "kl": 0.11173798143863678, "learning_rate": 4.979665506263285e-06, "loss": 0.0045, "reward": 1.475562572479248, "reward_std": 0.8967955112457275, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.36618751287460327, "step": 766 }, { "completion_length": 161.90625, "epoch": 0.41037988228999467, "grad_norm": 4.858072280883789, "kl": 0.19627657532691956, "learning_rate": 4.979466909764567e-06, "loss": 0.0079, "reward": 1.495437502861023, "reward_std": 0.6636543273925781, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41731250286102295, "step": 767 }, { "completion_length": 137.4375, "epoch": 0.41091492776886035, "grad_norm": 1.0792325735092163, "kl": 0.15953058004379272, "learning_rate": 4.979267352176962e-06, "loss": 0.0064, "reward": 2.41015625, "reward_std": 0.8682399392127991, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44140625, "step": 768 }, { "completion_length": 124.34375, "epoch": 0.41144997324772603, "grad_norm": 0.6798334717750549, "kl": 0.16005147993564606, "learning_rate": 4.979066833577823e-06, "loss": 0.0064, "reward": 1.73828125, "reward_std": 0.9750863909721375, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47265625, "step": 769 }, { "completion_length": 152.125, "epoch": 0.41198501872659177, "grad_norm": 0.9832243323326111, "kl": 0.1650640070438385, "learning_rate": 4.978865354044873e-06, "loss": 0.0066, "reward": 1.2565937042236328, "reward_std": 0.6126863956451416, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4128437638282776, "step": 770 }, { "completion_length": 146.5625, "epoch": 0.41252006420545745, "grad_norm": 729.3115234375, "kl": 3.272491216659546, "learning_rate": 4.97866291365621e-06, "loss": 0.1309, "reward": 1.5771875381469727, "reward_std": 1.1071627140045166, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.42093750834465027, "step": 771 }, { "completion_length": 147.71875, "epoch": 0.4130551096843232, "grad_norm": 1.3133302927017212, "kl": 0.2124113291501999, "learning_rate": 4.978459512490304e-06, "loss": 0.0085, "reward": 1.6650937795639038, "reward_std": 1.261139988899231, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.4307187497615814, "step": 772 }, { "completion_length": 141.46875, "epoch": 0.41359015516318887, "grad_norm": 1.573486566543579, "kl": 0.17919085919857025, "learning_rate": 4.9782551506259976e-06, "loss": 0.0072, "reward": 1.8038437366485596, "reward_std": 0.5996965169906616, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.42884376645088196, "step": 773 }, { "completion_length": 127.8125, "epoch": 0.41412520064205455, "grad_norm": 8634080256.0, "kl": 68797192.0, "learning_rate": 4.978049828142505e-06, "loss": 2751887.5, "reward": 2.0464375019073486, "reward_std": 0.843746542930603, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.49956250190734863, "step": 774 }, { "completion_length": 149.84375, "epoch": 0.4146602461209203, "grad_norm": 2.5647783279418945, "kl": 0.23976314067840576, "learning_rate": 4.977843545119413e-06, "loss": 0.0096, "reward": 1.44140625, "reward_std": 0.6967226266860962, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44140625, "step": 775 }, { "completion_length": 163.90625, "epoch": 0.41519529159978596, "grad_norm": 1.7657850980758667, "kl": 0.14700783789157867, "learning_rate": 4.977636301636681e-06, "loss": 0.0059, "reward": 1.5553749799728394, "reward_std": 0.9594166278839111, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.41475000977516174, "step": 776 }, { "completion_length": 130.09375, "epoch": 0.4157303370786517, "grad_norm": 1.1382274627685547, "kl": 0.24026134610176086, "learning_rate": 4.97742809777464e-06, "loss": 0.0096, "reward": 1.4919999837875366, "reward_std": 0.7301881313323975, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4763749837875366, "step": 777 }, { "completion_length": 143.8125, "epoch": 0.4162653825575174, "grad_norm": 134.18038940429688, "kl": 2.5782015323638916, "learning_rate": 4.977218933613995e-06, "loss": 0.1031, "reward": 0.9110312461853027, "reward_std": 0.5155492424964905, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.39540624618530273, "step": 778 }, { "completion_length": 131.0, "epoch": 0.4168004280363831, "grad_norm": 1.990918755531311, "kl": 0.20458152890205383, "learning_rate": 4.977008809235821e-06, "loss": 0.0082, "reward": 2.3874688148498535, "reward_std": 1.0671312808990479, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48121875524520874, "step": 779 }, { "completion_length": 127.03125, "epoch": 0.4173354735152488, "grad_norm": 0.8254163265228271, "kl": 0.18217384815216064, "learning_rate": 4.976797724721567e-06, "loss": 0.0073, "reward": 1.8441250324249268, "reward_std": 0.9837591648101807, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45350000262260437, "step": 780 }, { "completion_length": 135.1875, "epoch": 0.4178705189941145, "grad_norm": 7.144525527954102, "kl": 0.18535178899765015, "learning_rate": 4.976585680153052e-06, "loss": 0.0074, "reward": 1.9342188835144043, "reward_std": 0.5357639193534851, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.44984376430511475, "step": 781 }, { "completion_length": 144.375, "epoch": 0.4184055644729802, "grad_norm": 0.8421356678009033, "kl": 0.1450091004371643, "learning_rate": 4.9763726756124705e-06, "loss": 0.0058, "reward": 1.6470000743865967, "reward_std": 0.6993622183799744, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4751250147819519, "step": 782 }, { "completion_length": 143.875, "epoch": 0.4189406099518459, "grad_norm": 0.5547642707824707, "kl": 0.14694231748580933, "learning_rate": 4.976158711182386e-06, "loss": 0.0059, "reward": 1.5175312757492065, "reward_std": 1.0490076541900635, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.40815624594688416, "step": 783 }, { "completion_length": 145.15625, "epoch": 0.41947565543071164, "grad_norm": 394124928.0, "kl": 8651963.0, "learning_rate": 4.975943786945735e-06, "loss": 346078.5312, "reward": 1.8767499923706055, "reward_std": 1.0614882707595825, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.42362499237060547, "step": 784 }, { "completion_length": 141.03125, "epoch": 0.4200107009095773, "grad_norm": 1.1166108846664429, "kl": 0.1668730527162552, "learning_rate": 4.975727902985827e-06, "loss": 0.0067, "reward": 1.7845938205718994, "reward_std": 1.0514073371887207, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42521876096725464, "step": 785 }, { "completion_length": 150.1875, "epoch": 0.420545746388443, "grad_norm": 1.0005005598068237, "kl": 0.12211659550666809, "learning_rate": 4.975511059386342e-06, "loss": 0.0049, "reward": 1.1299999952316284, "reward_std": 0.655622124671936, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3956249952316284, "step": 786 }, { "completion_length": 121.75, "epoch": 0.42108079186730873, "grad_norm": 2.6984634399414062, "kl": 0.202499121427536, "learning_rate": 4.975293256231334e-06, "loss": 0.0081, "reward": 2.354249954223633, "reward_std": 0.9603708982467651, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4792500138282776, "step": 787 }, { "completion_length": 132.03125, "epoch": 0.4216158373461744, "grad_norm": 2.3648459911346436, "kl": 0.2371796816587448, "learning_rate": 4.975074493605225e-06, "loss": 0.0095, "reward": 1.419812560081482, "reward_std": 0.5511401295661926, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48231250047683716, "step": 788 }, { "completion_length": 143.5625, "epoch": 0.42215088282504015, "grad_norm": 0.892487645149231, "kl": 0.1929926723241806, "learning_rate": 4.9748547715928154e-06, "loss": 0.0077, "reward": 1.167718768119812, "reward_std": 0.49147897958755493, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.464593768119812, "step": 789 }, { "completion_length": 131.71875, "epoch": 0.42268592830390583, "grad_norm": 43174.62109375, "kl": 80.40558624267578, "learning_rate": 4.9746340902792706e-06, "loss": 3.2162, "reward": 2.131531238555908, "reward_std": 0.569999098777771, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4284062385559082, "step": 790 }, { "completion_length": 156.4375, "epoch": 0.4232209737827715, "grad_norm": 0.8279932737350464, "kl": 0.14281854033470154, "learning_rate": 4.974412449750132e-06, "loss": 0.0057, "reward": 2.003781318664551, "reward_std": 1.232346773147583, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.425656259059906, "step": 791 }, { "completion_length": 117.03125, "epoch": 0.42375601926163725, "grad_norm": 1.3173407316207886, "kl": 0.23774345219135284, "learning_rate": 4.974189850091312e-06, "loss": 0.0095, "reward": 2.51143741607666, "reward_std": 0.7870326042175293, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4958125054836273, "step": 792 }, { "completion_length": 118.375, "epoch": 0.42429106474050293, "grad_norm": 2.1267294883728027, "kl": 0.3039867579936981, "learning_rate": 4.973966291389094e-06, "loss": 0.0122, "reward": 2.16225004196167, "reward_std": 0.7013484239578247, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44349998235702515, "step": 793 }, { "completion_length": 117.40625, "epoch": 0.42482611021936867, "grad_norm": 1.2351768016815186, "kl": 0.2255229353904724, "learning_rate": 4.973741773730133e-06, "loss": 0.009, "reward": 2.0921874046325684, "reward_std": 0.9043206572532654, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.4671874940395355, "step": 794 }, { "completion_length": 130.90625, "epoch": 0.42536115569823435, "grad_norm": 2.0443737506866455, "kl": 0.15647858381271362, "learning_rate": 4.973516297201458e-06, "loss": 0.0063, "reward": 1.9358749389648438, "reward_std": 0.7864786982536316, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4827499985694885, "step": 795 }, { "completion_length": 134.6875, "epoch": 0.42589620117710003, "grad_norm": 4.760818004608154, "kl": 0.21603664755821228, "learning_rate": 4.973289861890467e-06, "loss": 0.0086, "reward": 1.6887812614440918, "reward_std": 0.6449176073074341, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4700312614440918, "step": 796 }, { "completion_length": 123.15625, "epoch": 0.42643124665596577, "grad_norm": 2.4785075187683105, "kl": 0.23532119393348694, "learning_rate": 4.973062467884932e-06, "loss": 0.0094, "reward": 2.715343952178955, "reward_std": 0.44996050000190735, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48096877336502075, "step": 797 }, { "completion_length": 127.375, "epoch": 0.42696629213483145, "grad_norm": 0.8289466500282288, "kl": 0.18557175993919373, "learning_rate": 4.972834115272992e-06, "loss": 0.0074, "reward": 2.8115625381469727, "reward_std": 0.394386887550354, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49906250834465027, "step": 798 }, { "completion_length": 165.3125, "epoch": 0.4275013376136972, "grad_norm": 0.5100564360618591, "kl": 0.12169687449932098, "learning_rate": 4.972604804143164e-06, "loss": 0.0049, "reward": 1.2864375114440918, "reward_std": 0.9010622501373291, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3801875114440918, "step": 799 }, { "completion_length": 135.53125, "epoch": 0.42803638309256287, "grad_norm": 1.3002461194992065, "kl": 0.16570459306240082, "learning_rate": 4.972374534584332e-06, "loss": 0.0066, "reward": 2.075000047683716, "reward_std": 1.0274608135223389, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43437501788139343, "step": 800 }, { "completion_length": 134.4375, "epoch": 0.42857142857142855, "grad_norm": 2.762803077697754, "kl": 0.17255641520023346, "learning_rate": 4.972143306685754e-06, "loss": 0.0069, "reward": 1.9039688110351562, "reward_std": 1.0282288789749146, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4508437514305115, "step": 801 }, { "completion_length": 145.34375, "epoch": 0.4291064740502943, "grad_norm": 310.3126220703125, "kl": 15.112701416015625, "learning_rate": 4.971911120537057e-06, "loss": 0.6045, "reward": 1.3095625638961792, "reward_std": 0.6127331852912903, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4189375042915344, "step": 802 }, { "completion_length": 161.84375, "epoch": 0.42964151952915997, "grad_norm": 1.2297476530075073, "kl": 0.10746116191148758, "learning_rate": 4.971677976228242e-06, "loss": 0.0043, "reward": 1.386812448501587, "reward_std": 0.9783912897109985, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3868124783039093, "step": 803 }, { "completion_length": 133.25, "epoch": 0.4301765650080257, "grad_norm": 1.5013593435287476, "kl": 0.19355422258377075, "learning_rate": 4.971443873849679e-06, "loss": 0.0077, "reward": 1.7967500686645508, "reward_std": 0.7500208020210266, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.453000009059906, "step": 804 }, { "completion_length": 135.96875, "epoch": 0.4307116104868914, "grad_norm": 50129772.0, "kl": 643390.0, "learning_rate": 4.971208813492111e-06, "loss": 25735.6016, "reward": 1.984375, "reward_std": 0.9681955575942993, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.5, "step": 805 }, { "completion_length": 100.625, "epoch": 0.43124665596575706, "grad_norm": 0.5799117088317871, "kl": 0.21708297729492188, "learning_rate": 4.970972795246653e-06, "loss": 0.0087, "reward": 2.343625068664551, "reward_std": 0.5508708953857422, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 806 }, { "completion_length": 124.59375, "epoch": 0.4317817014446228, "grad_norm": 5.691243648529053, "kl": 0.43813562393188477, "learning_rate": 4.970735819204788e-06, "loss": 0.0175, "reward": 1.6639375686645508, "reward_std": 0.7149986028671265, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 807 }, { "completion_length": 131.34375, "epoch": 0.4323167469234885, "grad_norm": 0.9546801447868347, "kl": 0.12969215214252472, "learning_rate": 4.970497885458374e-06, "loss": 0.0052, "reward": 2.4951562881469727, "reward_std": 1.193326473236084, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.47953125834465027, "step": 808 }, { "completion_length": 153.75, "epoch": 0.4328517924023542, "grad_norm": 700692.3125, "kl": 25702.3671875, "learning_rate": 4.97025899409964e-06, "loss": 1028.0947, "reward": 1.2801562547683716, "reward_std": 0.592929482460022, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4207812547683716, "step": 809 }, { "completion_length": 137.25, "epoch": 0.4333868378812199, "grad_norm": 1.379038691520691, "kl": 0.18592828512191772, "learning_rate": 4.970019145221181e-06, "loss": 0.0074, "reward": 1.6494687795639038, "reward_std": 0.8906005024909973, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4775937497615814, "step": 810 }, { "completion_length": 129.21875, "epoch": 0.4339218833600856, "grad_norm": 0.8785649538040161, "kl": 0.18318597972393036, "learning_rate": 4.969778338915971e-06, "loss": 0.0073, "reward": 1.80078125, "reward_std": 0.5503396391868591, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.48828125, "step": 811 }, { "completion_length": 147.71875, "epoch": 0.4344569288389513, "grad_norm": 1.0622199773788452, "kl": 0.12913654744625092, "learning_rate": 4.969536575277348e-06, "loss": 0.0052, "reward": 1.461093783378601, "reward_std": 0.633990466594696, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4142187535762787, "step": 812 }, { "completion_length": 155.40625, "epoch": 0.434991974317817, "grad_norm": 1.2283114194869995, "kl": 0.17295408248901367, "learning_rate": 4.969293854399026e-06, "loss": 0.0069, "reward": 1.4341249465942383, "reward_std": 0.7883819341659546, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.37162500619888306, "step": 813 }, { "completion_length": 146.53125, "epoch": 0.43552701979668274, "grad_norm": 1.6954481601715088, "kl": 0.2349030077457428, "learning_rate": 4.969050176375089e-06, "loss": 0.0094, "reward": 1.6130625009536743, "reward_std": 0.7648799419403076, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4724375009536743, "step": 814 }, { "completion_length": 134.40625, "epoch": 0.4360620652755484, "grad_norm": 0.7445634603500366, "kl": 0.1402065008878708, "learning_rate": 4.96880554129999e-06, "loss": 0.0056, "reward": 1.2922186851501465, "reward_std": 0.5770736336708069, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.44846874475479126, "step": 815 }, { "completion_length": 125.9375, "epoch": 0.43659711075441415, "grad_norm": 2.7879321575164795, "kl": 0.24967259168624878, "learning_rate": 4.968559949268553e-06, "loss": 0.01, "reward": 1.5875312089920044, "reward_std": 0.7530714273452759, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4781562387943268, "step": 816 }, { "completion_length": 118.03125, "epoch": 0.43713215623327983, "grad_norm": 1.173790693283081, "kl": 0.18336525559425354, "learning_rate": 4.968313400375977e-06, "loss": 0.0073, "reward": 2.2485313415527344, "reward_std": 0.8528735041618347, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4829062521457672, "step": 817 }, { "completion_length": 155.03125, "epoch": 0.4376672017121455, "grad_norm": 1.4260450601577759, "kl": 0.17565098404884338, "learning_rate": 4.9680658947178275e-06, "loss": 0.007, "reward": 2.071500062942505, "reward_std": 0.9938442707061768, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4465000033378601, "step": 818 }, { "completion_length": 145.0625, "epoch": 0.43820224719101125, "grad_norm": 2.3781239986419678, "kl": 0.17742598056793213, "learning_rate": 4.9678174323900415e-06, "loss": 0.0071, "reward": 1.15234375, "reward_std": 0.5837881565093994, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.43359375, "step": 819 }, { "completion_length": 147.1875, "epoch": 0.43873729266987693, "grad_norm": 1.5269430875778198, "kl": 0.16029156744480133, "learning_rate": 4.967568013488931e-06, "loss": 0.0064, "reward": 1.4878437519073486, "reward_std": 1.025946021080017, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.40971875190734863, "step": 820 }, { "completion_length": 163.65625, "epoch": 0.43927233814874267, "grad_norm": 0.8488768935203552, "kl": 0.10442274063825607, "learning_rate": 4.967317638111172e-06, "loss": 0.0042, "reward": 1.2181875705718994, "reward_std": 0.8763999342918396, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.39006251096725464, "step": 821 }, { "completion_length": 126.875, "epoch": 0.43980738362760835, "grad_norm": 2.6406044960021973, "kl": 0.17878447473049164, "learning_rate": 4.967066306353816e-06, "loss": 0.0072, "reward": 2.2833750247955322, "reward_std": 0.7095790505409241, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47087499499320984, "step": 822 }, { "completion_length": 138.34375, "epoch": 0.44034242910647403, "grad_norm": 1.465201735496521, "kl": 0.19643355906009674, "learning_rate": 4.966814018314284e-06, "loss": 0.0079, "reward": 1.5104999542236328, "reward_std": 0.700171947479248, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.4323749840259552, "step": 823 }, { "completion_length": 122.5625, "epoch": 0.44087747458533977, "grad_norm": 1.5473872423171997, "kl": 0.1858138144016266, "learning_rate": 4.9665607740903685e-06, "loss": 0.0074, "reward": 2.7840938568115234, "reward_std": 0.9478923082351685, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4872187674045563, "step": 824 }, { "completion_length": 150.9375, "epoch": 0.44141252006420545, "grad_norm": 1.1880416870117188, "kl": 0.21678206324577332, "learning_rate": 4.966306573780232e-06, "loss": 0.0087, "reward": 1.8955937623977661, "reward_std": 0.5235259532928467, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4424687623977661, "step": 825 }, { "completion_length": 128.375, "epoch": 0.4419475655430712, "grad_norm": 4.745180606842041, "kl": 0.15774862468242645, "learning_rate": 4.966051417482405e-06, "loss": 0.0063, "reward": 1.5026874542236328, "reward_std": 0.6844440698623657, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4714375138282776, "step": 826 }, { "completion_length": 134.21875, "epoch": 0.44248261102193687, "grad_norm": 1.554694414138794, "kl": 0.197069451212883, "learning_rate": 4.965795305295793e-06, "loss": 0.0079, "reward": 1.7767187356948853, "reward_std": 0.8925039768218994, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47984373569488525, "step": 827 }, { "completion_length": 114.25, "epoch": 0.44301765650080255, "grad_norm": 1874.23486328125, "kl": 3.2672948837280273, "learning_rate": 4.96553823731967e-06, "loss": 0.1307, "reward": 1.2636250257492065, "reward_std": 0.5562076568603516, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46675002574920654, "step": 828 }, { "completion_length": 148.40625, "epoch": 0.4435527019796683, "grad_norm": 0.8587542176246643, "kl": 0.1423155814409256, "learning_rate": 4.9652802136536806e-06, "loss": 0.0057, "reward": 1.0696874856948853, "reward_std": 0.6355422735214233, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.38218748569488525, "step": 829 }, { "completion_length": 130.3125, "epoch": 0.44408774745853397, "grad_norm": 2.90592885017395, "kl": 0.1953621506690979, "learning_rate": 4.965021234397839e-06, "loss": 0.0078, "reward": 1.5607812404632568, "reward_std": 0.9485610723495483, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.43578124046325684, "step": 830 }, { "completion_length": 149.5625, "epoch": 0.4446227929373997, "grad_norm": 1.449951410293579, "kl": 0.13952939212322235, "learning_rate": 4.964761299652529e-06, "loss": 0.0056, "reward": 1.3900938034057617, "reward_std": 0.9914252758026123, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.39009374380111694, "step": 831 }, { "completion_length": 115.96875, "epoch": 0.4451578384162654, "grad_norm": 1.036494493484497, "kl": 0.1944705843925476, "learning_rate": 4.96450040951851e-06, "loss": 0.0078, "reward": 1.5545313358306885, "reward_std": 0.4786369800567627, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4920312166213989, "step": 832 }, { "completion_length": 143.65625, "epoch": 0.44569288389513106, "grad_norm": 977.6521606445312, "kl": 2.347240924835205, "learning_rate": 4.964238564096905e-06, "loss": 0.0939, "reward": 1.1654374599456787, "reward_std": 0.6181628108024597, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.4466874897480011, "step": 833 }, { "completion_length": 140.09375, "epoch": 0.4462279293739968, "grad_norm": 0.9063860177993774, "kl": 0.1700434684753418, "learning_rate": 4.963975763489212e-06, "loss": 0.0068, "reward": 1.8125, "reward_std": 0.9809916615486145, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.453125, "step": 834 }, { "completion_length": 138.65625, "epoch": 0.4467629748528625, "grad_norm": 2.2715094089508057, "kl": 0.17137715220451355, "learning_rate": 4.9637120077972965e-06, "loss": 0.0069, "reward": 1.4821875095367432, "reward_std": 0.8524900078773499, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.45093750953674316, "step": 835 }, { "completion_length": 105.34375, "epoch": 0.4472980203317282, "grad_norm": 1.4519892930984497, "kl": 0.21704649925231934, "learning_rate": 4.9634472971233955e-06, "loss": 0.0087, "reward": 2.015500068664551, "reward_std": 0.7595565319061279, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 836 }, { "completion_length": 144.90625, "epoch": 0.4478330658105939, "grad_norm": 1.0556985139846802, "kl": 0.14911115169525146, "learning_rate": 4.963181631570117e-06, "loss": 0.006, "reward": 1.746000051498413, "reward_std": 1.2840251922607422, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4022499918937683, "step": 837 }, { "completion_length": 139.4375, "epoch": 0.4483681112894596, "grad_norm": 0.7137891054153442, "kl": 0.15247061848640442, "learning_rate": 4.962915011240435e-06, "loss": 0.0061, "reward": 2.0531249046325684, "reward_std": 1.0257244110107422, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4749999940395355, "step": 838 }, { "completion_length": 134.0625, "epoch": 0.4489031567683253, "grad_norm": 1.4453493356704712, "kl": 0.1466670036315918, "learning_rate": 4.962647436237701e-06, "loss": 0.0059, "reward": 1.9284374713897705, "reward_std": 0.7156780362129211, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4284375011920929, "step": 839 }, { "completion_length": 141.5, "epoch": 0.449438202247191, "grad_norm": 3.3537259101867676, "kl": 0.1594792753458023, "learning_rate": 4.962378906665628e-06, "loss": 0.0064, "reward": 1.625, "reward_std": 0.8883334994316101, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.40625, "step": 840 }, { "completion_length": 156.5, "epoch": 0.44997324772605674, "grad_norm": 0.712562620639801, "kl": 0.12768465280532837, "learning_rate": 4.962109422628306e-06, "loss": 0.0051, "reward": 1.4972813129425049, "reward_std": 1.0773807764053345, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.3879062533378601, "step": 841 }, { "completion_length": 133.46875, "epoch": 0.4505082932049224, "grad_norm": 8.094996452331543, "kl": 0.3012368977069855, "learning_rate": 4.961838984230192e-06, "loss": 0.012, "reward": 1.2675625085830688, "reward_std": 0.5194171071052551, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47068750858306885, "step": 842 }, { "completion_length": 126.125, "epoch": 0.4510433386837881, "grad_norm": 1.5164119005203247, "kl": 0.3192216753959656, "learning_rate": 4.961567591576112e-06, "loss": 0.0128, "reward": 2.0375938415527344, "reward_std": 1.1296660900115967, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4438437521457672, "step": 843 }, { "completion_length": 121.875, "epoch": 0.45157838416265383, "grad_norm": 3.955918312072754, "kl": 0.25972843170166016, "learning_rate": 4.961295244771263e-06, "loss": 0.0104, "reward": 1.8767187595367432, "reward_std": 0.9176319241523743, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47046875953674316, "step": 844 }, { "completion_length": 149.9375, "epoch": 0.4521134296415195, "grad_norm": 1.7792648077011108, "kl": 0.12856483459472656, "learning_rate": 4.961021943921213e-06, "loss": 0.0051, "reward": 1.6711561679840088, "reward_std": 1.081146240234375, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40553125739097595, "step": 845 }, { "completion_length": 142.78125, "epoch": 0.45264847512038525, "grad_norm": 1.6461435556411743, "kl": 0.17593184113502502, "learning_rate": 4.960747689131897e-06, "loss": 0.007, "reward": 1.7075936794281006, "reward_std": 1.023246169090271, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.44196873903274536, "step": 846 }, { "completion_length": 151.84375, "epoch": 0.45318352059925093, "grad_norm": 1.9349684715270996, "kl": 0.15177926421165466, "learning_rate": 4.960472480509623e-06, "loss": 0.0061, "reward": 2.2758126258850098, "reward_std": 0.8195909261703491, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4633125066757202, "step": 847 }, { "completion_length": 141.0, "epoch": 0.4537185660781166, "grad_norm": 2.4733150005340576, "kl": 0.14910373091697693, "learning_rate": 4.960196318161067e-06, "loss": 0.006, "reward": 1.3905937671661377, "reward_std": 0.832597553730011, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4062187671661377, "step": 848 }, { "completion_length": 123.5625, "epoch": 0.45425361155698235, "grad_norm": 0.9710436463356018, "kl": 0.16711319983005524, "learning_rate": 4.959919202193273e-06, "loss": 0.0067, "reward": 2.2320001125335693, "reward_std": 0.7951905131340027, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4819999933242798, "step": 849 }, { "completion_length": 138.8125, "epoch": 0.45478865703584803, "grad_norm": 1.7468814849853516, "kl": 0.1693132072687149, "learning_rate": 4.959641132713659e-06, "loss": 0.0068, "reward": 2.0547499656677246, "reward_std": 0.8594829440116882, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.460999995470047, "step": 850 }, { "completion_length": 158.875, "epoch": 0.45532370251471377, "grad_norm": 19.546751022338867, "kl": 0.15554198622703552, "learning_rate": 4.959362109830007e-06, "loss": 0.0062, "reward": 1.4921875, "reward_std": 0.7677806615829468, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4453125, "step": 851 }, { "completion_length": 126.3125, "epoch": 0.45585874799357945, "grad_norm": 0.9998789429664612, "kl": 0.19562608003616333, "learning_rate": 4.959082133650475e-06, "loss": 0.0078, "reward": 1.8615000247955322, "reward_std": 0.6337116956710815, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.47087499499320984, "step": 852 }, { "completion_length": 121.84375, "epoch": 0.45639379347244513, "grad_norm": 0.8177929520606995, "kl": 0.21693910658359528, "learning_rate": 4.958801204283585e-06, "loss": 0.0087, "reward": 2.3050312995910645, "reward_std": 0.7877055406570435, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4769062399864197, "step": 853 }, { "completion_length": 138.8125, "epoch": 0.45692883895131087, "grad_norm": 0.7858922481536865, "kl": 0.1715869903564453, "learning_rate": 4.958519321838231e-06, "loss": 0.0069, "reward": 1.5700312852859497, "reward_std": 0.740085244178772, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4762812554836273, "step": 854 }, { "completion_length": 139.84375, "epoch": 0.45746388443017655, "grad_norm": 1.4638384580612183, "kl": 0.14101850986480713, "learning_rate": 4.958236486423676e-06, "loss": 0.0056, "reward": 1.5078125, "reward_std": 0.9145639538764954, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4296875, "step": 855 }, { "completion_length": 126.0625, "epoch": 0.4579989299090423, "grad_norm": 1158425344.0, "kl": 114564648.0, "learning_rate": 4.957952698149554e-06, "loss": 4582586.0, "reward": 2.1231250762939453, "reward_std": 0.720289945602417, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48250001668930054, "step": 856 }, { "completion_length": 131.9375, "epoch": 0.45853397538790797, "grad_norm": 1.5479973554611206, "kl": 0.33680838346481323, "learning_rate": 4.9576679571258645e-06, "loss": 0.0135, "reward": 2.0078125, "reward_std": 0.6584897041320801, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4765625, "step": 857 }, { "completion_length": 139.78125, "epoch": 0.4590690208667737, "grad_norm": 1.215806007385254, "kl": 0.13599511981010437, "learning_rate": 4.95738226346298e-06, "loss": 0.0054, "reward": 1.5267187356948853, "reward_std": 0.7731457948684692, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46421873569488525, "step": 858 }, { "completion_length": 120.5625, "epoch": 0.4596040663456394, "grad_norm": 1.4612455368041992, "kl": 0.19374634325504303, "learning_rate": 4.957095617271639e-06, "loss": 0.0077, "reward": 1.47265625, "reward_std": 0.4793778955936432, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48828125, "step": 859 }, { "completion_length": 123.0, "epoch": 0.46013911182450506, "grad_norm": 0.9390439987182617, "kl": 0.16586022078990936, "learning_rate": 4.956808018662954e-06, "loss": 0.0066, "reward": 1.5129687786102295, "reward_std": 0.5367727875709534, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4660937488079071, "step": 860 }, { "completion_length": 123.59375, "epoch": 0.4606741573033708, "grad_norm": 28.3468017578125, "kl": 0.1387433111667633, "learning_rate": 4.9565194677484005e-06, "loss": 0.0055, "reward": 2.1857187747955322, "reward_std": 0.612657904624939, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48259374499320984, "step": 861 }, { "completion_length": 142.25, "epoch": 0.4612092027822365, "grad_norm": 0.5489698052406311, "kl": 0.1292426884174347, "learning_rate": 4.956229964639829e-06, "loss": 0.0052, "reward": 1.6430312395095825, "reward_std": 0.769286036491394, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4711562395095825, "step": 862 }, { "completion_length": 145.78125, "epoch": 0.4617442482611022, "grad_norm": 1.2811851501464844, "kl": 0.1391110122203827, "learning_rate": 4.9559395094494555e-06, "loss": 0.0056, "reward": 1.684000015258789, "reward_std": 0.9232810139656067, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4183749854564667, "step": 863 }, { "completion_length": 102.59375, "epoch": 0.4622792937399679, "grad_norm": 1.3277021646499634, "kl": 0.30299946665763855, "learning_rate": 4.9556481022898655e-06, "loss": 0.0121, "reward": 2.5331873893737793, "reward_std": 0.8041226863861084, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48631250858306885, "step": 864 }, { "completion_length": 121.03125, "epoch": 0.4628143392188336, "grad_norm": 1.129193663597107, "kl": 0.13050663471221924, "learning_rate": 4.955355743274014e-06, "loss": 0.0052, "reward": 1.6861562728881836, "reward_std": 0.8243085145950317, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4674062728881836, "step": 865 }, { "completion_length": 116.96875, "epoch": 0.4633493846976993, "grad_norm": 0.7117172479629517, "kl": 0.1561349630355835, "learning_rate": 4.955062432515227e-06, "loss": 0.0062, "reward": 2.1948437690734863, "reward_std": 0.8156503438949585, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46046873927116394, "step": 866 }, { "completion_length": 130.0, "epoch": 0.463884430176565, "grad_norm": 1.0738945007324219, "kl": 0.13762183487415314, "learning_rate": 4.954768170127195e-06, "loss": 0.0055, "reward": 1.7335312366485596, "reward_std": 0.5467654466629028, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48353126645088196, "step": 867 }, { "completion_length": 142.15625, "epoch": 0.46441947565543074, "grad_norm": 1.2093340158462524, "kl": 0.15679848194122314, "learning_rate": 4.954472956223979e-06, "loss": 0.0063, "reward": 1.5894687175750732, "reward_std": 0.7301170825958252, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.38634374737739563, "step": 868 }, { "completion_length": 140.71875, "epoch": 0.4649545211342964, "grad_norm": 0.9394309520721436, "kl": 0.1393403708934784, "learning_rate": 4.954176790920012e-06, "loss": 0.0056, "reward": 2.1959686279296875, "reward_std": 0.9133194088935852, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44596874713897705, "step": 869 }, { "completion_length": 159.90625, "epoch": 0.4654895666131621, "grad_norm": 0.8987517952919006, "kl": 0.1579708456993103, "learning_rate": 4.953879674330093e-06, "loss": 0.0063, "reward": 1.5159374475479126, "reward_std": 1.0286957025527954, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.4065625071525574, "step": 870 }, { "completion_length": 140.625, "epoch": 0.46602461209202783, "grad_norm": 8.488075256347656, "kl": 0.15029500424861908, "learning_rate": 4.953581606569389e-06, "loss": 0.006, "reward": 1.6322187185287476, "reward_std": 0.807526707649231, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46034374833106995, "step": 871 }, { "completion_length": 141.65625, "epoch": 0.4665596575708935, "grad_norm": 2.4035377502441406, "kl": 0.4269987642765045, "learning_rate": 4.953282587753438e-06, "loss": 0.0171, "reward": 1.8507812023162842, "reward_std": 1.0132473707199097, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41328126192092896, "step": 872 }, { "completion_length": 132.9375, "epoch": 0.46709470304975925, "grad_norm": 10.330764770507812, "kl": 0.3164811134338379, "learning_rate": 4.952982617998143e-06, "loss": 0.0127, "reward": 1.2303438186645508, "reward_std": 0.8291767835617065, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.464718759059906, "step": 873 }, { "completion_length": 119.28125, "epoch": 0.46762974852862493, "grad_norm": 1.1203900575637817, "kl": 0.16531209647655487, "learning_rate": 4.952681697419781e-06, "loss": 0.0066, "reward": 2.800656318664551, "reward_std": 0.9997076392173767, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 874 }, { "completion_length": 132.9375, "epoch": 0.4681647940074906, "grad_norm": 1.0336828231811523, "kl": 0.16182027757167816, "learning_rate": 4.952379826134993e-06, "loss": 0.0065, "reward": 1.4393436908721924, "reward_std": 0.5704683065414429, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45496875047683716, "step": 875 }, { "completion_length": 151.75, "epoch": 0.46869983948635635, "grad_norm": 1.1111910343170166, "kl": 0.14752840995788574, "learning_rate": 4.95207700426079e-06, "loss": 0.0059, "reward": 1.7548437118530273, "reward_std": 1.0629451274871826, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42671874165534973, "step": 876 }, { "completion_length": 145.03125, "epoch": 0.46923488496522203, "grad_norm": 1.0469400882720947, "kl": 0.13087472319602966, "learning_rate": 4.951773231914553e-06, "loss": 0.0052, "reward": 1.5234375, "reward_std": 0.9934717416763306, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4296875, "step": 877 }, { "completion_length": 135.96875, "epoch": 0.46976993044408777, "grad_norm": 1.6003727912902832, "kl": 0.21891731023788452, "learning_rate": 4.951468509214028e-06, "loss": 0.0088, "reward": 2.1878437995910645, "reward_std": 0.9960200190544128, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4534687399864197, "step": 878 }, { "completion_length": 140.09375, "epoch": 0.47030497592295345, "grad_norm": 1.51548433303833, "kl": 0.14079315960407257, "learning_rate": 4.9511628362773326e-06, "loss": 0.0056, "reward": 1.8935312032699585, "reward_std": 0.9226844310760498, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45603126287460327, "step": 879 }, { "completion_length": 129.40625, "epoch": 0.47084002140181913, "grad_norm": 0.7993934154510498, "kl": 0.17568835616111755, "learning_rate": 4.950856213222951e-06, "loss": 0.007, "reward": 1.8744688034057617, "reward_std": 1.0593181848526, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46821874380111694, "step": 880 }, { "completion_length": 125.8125, "epoch": 0.47137506688068487, "grad_norm": 1.2375811338424683, "kl": 0.14002850651741028, "learning_rate": 4.950548640169737e-06, "loss": 0.0056, "reward": 2.1014063358306885, "reward_std": 0.7972573041915894, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4607812464237213, "step": 881 }, { "completion_length": 144.65625, "epoch": 0.47191011235955055, "grad_norm": 1.0600438117980957, "kl": 0.1923241913318634, "learning_rate": 4.950240117236909e-06, "loss": 0.0077, "reward": 1.2444374561309814, "reward_std": 0.5937182903289795, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.44756248593330383, "step": 882 }, { "completion_length": 117.6875, "epoch": 0.4724451578384163, "grad_norm": 91.11113739013672, "kl": 1.8555890321731567, "learning_rate": 4.949930644544061e-06, "loss": 0.0742, "reward": 2.505687713623047, "reward_std": 0.8243575096130371, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4900624752044678, "step": 883 }, { "completion_length": 116.46875, "epoch": 0.47298020331728197, "grad_norm": 2.1816697120666504, "kl": 0.17067237198352814, "learning_rate": 4.949620222211147e-06, "loss": 0.0068, "reward": 2.1484375, "reward_std": 0.9866080284118652, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 884 }, { "completion_length": 129.0625, "epoch": 0.47351524879614765, "grad_norm": 672980096.0, "kl": 90830992.0, "learning_rate": 4.949308850358494e-06, "loss": 3633239.5, "reward": 1.7222812175750732, "reward_std": 0.8174241781234741, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47228124737739563, "step": 885 }, { "completion_length": 155.28125, "epoch": 0.4740502942750134, "grad_norm": 2.127235174179077, "kl": 0.1976456195116043, "learning_rate": 4.948996529106796e-06, "loss": 0.0079, "reward": 2.076906204223633, "reward_std": 1.0404387712478638, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4675312638282776, "step": 886 }, { "completion_length": 128.84375, "epoch": 0.47458533975387907, "grad_norm": 11.707980155944824, "kl": 0.34203940629959106, "learning_rate": 4.948683258577114e-06, "loss": 0.0137, "reward": 2.2069687843322754, "reward_std": 0.9531989693641663, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.488218754529953, "step": 887 }, { "completion_length": 116.25, "epoch": 0.4751203852327448, "grad_norm": 7943.7900390625, "kl": 28.143192291259766, "learning_rate": 4.948369038890877e-06, "loss": 1.1257, "reward": 2.582656145095825, "reward_std": 1.024734616279602, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48890623450279236, "step": 888 }, { "completion_length": 157.8125, "epoch": 0.4756554307116105, "grad_norm": 0.8414778113365173, "kl": 0.14291182160377502, "learning_rate": 4.948053870169884e-06, "loss": 0.0057, "reward": 1.2533124685287476, "reward_std": 0.6803672909736633, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42518749833106995, "step": 889 }, { "completion_length": 139.3125, "epoch": 0.47619047619047616, "grad_norm": 0.5133932828903198, "kl": 0.14098069071769714, "learning_rate": 4.947737752536299e-06, "loss": 0.0056, "reward": 1.563156247138977, "reward_std": 0.6673469543457031, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46940624713897705, "step": 890 }, { "completion_length": 120.15625, "epoch": 0.4767255216693419, "grad_norm": 2.1623785495758057, "kl": 0.22137954831123352, "learning_rate": 4.947420686112657e-06, "loss": 0.0089, "reward": 2.0218124389648438, "reward_std": 0.963318943977356, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4905624985694885, "step": 891 }, { "completion_length": 144.21875, "epoch": 0.4772605671482076, "grad_norm": 2230314.75, "kl": 67127.3359375, "learning_rate": 4.947102671021858e-06, "loss": 2685.0935, "reward": 1.95703125, "reward_std": 0.8658429980278015, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 892 }, { "completion_length": 142.71875, "epoch": 0.4777956126270733, "grad_norm": 2.2260241508483887, "kl": 0.33584123849868774, "learning_rate": 4.94678370738717e-06, "loss": 0.0134, "reward": 1.948062539100647, "reward_std": 0.912794828414917, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4793125092983246, "step": 893 }, { "completion_length": 124.40625, "epoch": 0.478330658105939, "grad_norm": 2.2902846336364746, "kl": 0.20066726207733154, "learning_rate": 4.946463795332232e-06, "loss": 0.008, "reward": 1.685156226158142, "reward_std": 1.1948421001434326, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4507812559604645, "step": 894 }, { "completion_length": 119.625, "epoch": 0.47886570358480474, "grad_norm": 1.0164430141448975, "kl": 0.18999570608139038, "learning_rate": 4.946142934981044e-06, "loss": 0.0076, "reward": 2.576437473297119, "reward_std": 0.6366913318634033, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49831250309944153, "step": 895 }, { "completion_length": 130.3125, "epoch": 0.4794007490636704, "grad_norm": 2.329267740249634, "kl": 0.19951178133487701, "learning_rate": 4.945821126457981e-06, "loss": 0.008, "reward": 1.997093677520752, "reward_std": 0.7556289434432983, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48146873712539673, "step": 896 }, { "completion_length": 135.5, "epoch": 0.4799357945425361, "grad_norm": 0.7179273366928101, "kl": 0.1413559913635254, "learning_rate": 4.945498369887781e-06, "loss": 0.0057, "reward": 1.930375099182129, "reward_std": 0.9686849117279053, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46162497997283936, "step": 897 }, { "completion_length": 126.09375, "epoch": 0.48047084002140183, "grad_norm": 2.800442934036255, "kl": 0.2242724448442459, "learning_rate": 4.945174665395551e-06, "loss": 0.009, "reward": 1.989687442779541, "reward_std": 0.6736688613891602, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4896875023841858, "step": 898 }, { "completion_length": 142.6875, "epoch": 0.4810058855002675, "grad_norm": 3.3852555751800537, "kl": 0.21909600496292114, "learning_rate": 4.944850013106765e-06, "loss": 0.0088, "reward": 1.9296875, "reward_std": 0.9695327877998352, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 899 }, { "completion_length": 141.25, "epoch": 0.48154093097913325, "grad_norm": 0.7186158895492554, "kl": 0.11529001593589783, "learning_rate": 4.944524413147264e-06, "loss": 0.0046, "reward": 2.0010626316070557, "reward_std": 0.9135605096817017, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4698125123977661, "step": 900 }, { "completion_length": 150.375, "epoch": 0.48207597645799893, "grad_norm": 0.5586996078491211, "kl": 0.14333003759384155, "learning_rate": 4.944197865643256e-06, "loss": 0.0057, "reward": 2.0039374828338623, "reward_std": 1.1191339492797852, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4414375126361847, "step": 901 }, { "completion_length": 156.21875, "epoch": 0.4826110219368646, "grad_norm": 1.319678783416748, "kl": 0.14591002464294434, "learning_rate": 4.943870370721319e-06, "loss": 0.0058, "reward": 1.4121249914169312, "reward_std": 0.839949369430542, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.42774999141693115, "step": 902 }, { "completion_length": 153.0625, "epoch": 0.48314606741573035, "grad_norm": 12.478883743286133, "kl": 0.4169386625289917, "learning_rate": 4.943541928508395e-06, "loss": 0.0167, "reward": 1.3492499589920044, "reward_std": 1.0182526111602783, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.36487501859664917, "step": 903 }, { "completion_length": 128.59375, "epoch": 0.48368111289459603, "grad_norm": 1.8565694093704224, "kl": 0.1550792157649994, "learning_rate": 4.943212539131795e-06, "loss": 0.0062, "reward": 1.6558438539505005, "reward_std": 1.0099881887435913, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42146873474121094, "step": 904 }, { "completion_length": 136.625, "epoch": 0.48421615837346177, "grad_norm": 3.3480236530303955, "kl": 0.14732399582862854, "learning_rate": 4.942882202719196e-06, "loss": 0.0059, "reward": 1.38671875, "reward_std": 0.5442227721214294, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46484375, "step": 905 }, { "completion_length": 131.875, "epoch": 0.48475120385232745, "grad_norm": 0.4816553294658661, "kl": 0.1293412446975708, "learning_rate": 4.9425509193986445e-06, "loss": 0.0052, "reward": 2.128499984741211, "reward_std": 0.9699258804321289, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47224998474121094, "step": 906 }, { "completion_length": 124.4375, "epoch": 0.48528624933119313, "grad_norm": 10.941743850708008, "kl": 0.39082589745521545, "learning_rate": 4.9422186892985505e-06, "loss": 0.0156, "reward": 2.5777499675750732, "reward_std": 0.6817602515220642, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.49962499737739563, "step": 907 }, { "completion_length": 133.5625, "epoch": 0.48582129481005887, "grad_norm": 0.49893704056739807, "kl": 0.12422239780426025, "learning_rate": 4.941885512547692e-06, "loss": 0.005, "reward": 1.7972187995910645, "reward_std": 0.6985809803009033, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4847187399864197, "step": 908 }, { "completion_length": 155.4375, "epoch": 0.48635634028892455, "grad_norm": 4.411563873291016, "kl": 0.24578650295734406, "learning_rate": 4.941551389275217e-06, "loss": 0.0098, "reward": 1.47265625, "reward_std": 0.7446355819702148, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45703125, "step": 909 }, { "completion_length": 147.90625, "epoch": 0.4868913857677903, "grad_norm": 2.086714267730713, "kl": 0.17880025506019592, "learning_rate": 4.941216319610637e-06, "loss": 0.0072, "reward": 1.5851562023162842, "reward_std": 0.9413672685623169, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41328126192092896, "step": 910 }, { "completion_length": 158.75, "epoch": 0.48742643124665597, "grad_norm": 1.9320812225341797, "kl": 0.15468597412109375, "learning_rate": 4.9408803036838315e-06, "loss": 0.0062, "reward": 1.2863749265670776, "reward_std": 1.0453195571899414, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.3488749861717224, "step": 911 }, { "completion_length": 142.3125, "epoch": 0.48796147672552165, "grad_norm": 1.6682684421539307, "kl": 0.1586349904537201, "learning_rate": 4.940543341625046e-06, "loss": 0.0063, "reward": 1.75390625, "reward_std": 0.43771666288375854, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 912 }, { "completion_length": 135.96875, "epoch": 0.4884965222043874, "grad_norm": 3.130690097808838, "kl": 0.17272454500198364, "learning_rate": 4.940205433564895e-06, "loss": 0.0069, "reward": 1.4124062061309814, "reward_std": 0.4884790778160095, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4749062657356262, "step": 913 }, { "completion_length": 140.75, "epoch": 0.48903156768325307, "grad_norm": 1779447627776.0, "kl": 1627057792.0, "learning_rate": 4.9398665796343564e-06, "loss": 65082312.0, "reward": 1.4800000190734863, "reward_std": 0.8700904846191406, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43312501907348633, "step": 914 }, { "completion_length": 140.53125, "epoch": 0.4895666131621188, "grad_norm": 2.45139479637146, "kl": 0.1389804184436798, "learning_rate": 4.939526779964778e-06, "loss": 0.0056, "reward": 1.799218773841858, "reward_std": 0.9095158576965332, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.4554687738418579, "step": 915 }, { "completion_length": 135.0, "epoch": 0.4901016586409845, "grad_norm": 2.5142316818237305, "kl": 0.14170968532562256, "learning_rate": 4.939186034687872e-06, "loss": 0.0057, "reward": 2.32603120803833, "reward_std": 1.0188815593719482, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48228126764297485, "step": 916 }, { "completion_length": 141.78125, "epoch": 0.49063670411985016, "grad_norm": 2.482349395751953, "kl": 0.1967138946056366, "learning_rate": 4.938844343935717e-06, "loss": 0.0079, "reward": 1.2081875801086426, "reward_std": 0.7622075080871582, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4113124907016754, "step": 917 }, { "completion_length": 125.21875, "epoch": 0.4911717495987159, "grad_norm": 199.8750457763672, "kl": 3.351754665374756, "learning_rate": 4.9385017078407604e-06, "loss": 0.1341, "reward": 2.623624801635742, "reward_std": 0.8976134061813354, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4829999804496765, "step": 918 }, { "completion_length": 141.34375, "epoch": 0.4917067950775816, "grad_norm": 1.6148539781570435, "kl": 0.1598065197467804, "learning_rate": 4.938158126535814e-06, "loss": 0.0064, "reward": 1.7488125562667847, "reward_std": 0.7590832710266113, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4675624966621399, "step": 919 }, { "completion_length": 156.6875, "epoch": 0.4922418405564473, "grad_norm": 0.4114759862422943, "kl": 0.12810362875461578, "learning_rate": 4.937813600154055e-06, "loss": 0.0051, "reward": 1.0141249895095825, "reward_std": 0.3262052536010742, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3891249895095825, "step": 920 }, { "completion_length": 133.25, "epoch": 0.492776886035313, "grad_norm": 1.1953731775283813, "kl": 0.15774795413017273, "learning_rate": 4.93746812882903e-06, "loss": 0.0063, "reward": 2.4814374446868896, "reward_std": 0.7899115085601807, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4970625042915344, "step": 921 }, { "completion_length": 156.03125, "epoch": 0.4933119315141787, "grad_norm": 0.939202070236206, "kl": 0.125453919172287, "learning_rate": 4.9371217126946506e-06, "loss": 0.005, "reward": 1.234624981880188, "reward_std": 0.6840736865997314, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.3908750116825104, "step": 922 }, { "completion_length": 150.4375, "epoch": 0.4938469769930444, "grad_norm": 1.4293978214263916, "kl": 0.2120893895626068, "learning_rate": 4.936774351885194e-06, "loss": 0.0085, "reward": 1.6335313320159912, "reward_std": 1.0719177722930908, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.38353124260902405, "step": 923 }, { "completion_length": 153.1875, "epoch": 0.4943820224719101, "grad_norm": 1.7343331575393677, "kl": 0.16154812276363373, "learning_rate": 4.936426046535303e-06, "loss": 0.0065, "reward": 1.3163750171661377, "reward_std": 0.8580189943313599, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4101249873638153, "step": 924 }, { "completion_length": 140.65625, "epoch": 0.49491706795077584, "grad_norm": 2.4589593410491943, "kl": 0.14875394105911255, "learning_rate": 4.936076796779988e-06, "loss": 0.006, "reward": 2.066281318664551, "reward_std": 0.8261409401893616, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.441281259059906, "step": 925 }, { "completion_length": 136.21875, "epoch": 0.4954521134296415, "grad_norm": 2.2356550693511963, "kl": 0.17400804162025452, "learning_rate": 4.935726602754626e-06, "loss": 0.007, "reward": 1.2615312337875366, "reward_std": 0.37178099155426025, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4802812337875366, "step": 926 }, { "completion_length": 119.4375, "epoch": 0.4959871589085072, "grad_norm": 1.1838778257369995, "kl": 0.20302022993564606, "learning_rate": 4.935375464594957e-06, "loss": 0.0081, "reward": 1.5082499980926514, "reward_std": 0.7983192801475525, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44574999809265137, "step": 927 }, { "completion_length": 142.0625, "epoch": 0.49652220438737293, "grad_norm": 0.734818696975708, "kl": 0.16555540263652802, "learning_rate": 4.93502338243709e-06, "loss": 0.0066, "reward": 1.7550938129425049, "reward_std": 0.5768139958381653, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4269687533378601, "step": 928 }, { "completion_length": 139.375, "epoch": 0.4970572498662386, "grad_norm": 7.964299201965332, "kl": 0.185215026140213, "learning_rate": 4.934670356417498e-06, "loss": 0.0074, "reward": 1.5756874084472656, "reward_std": 0.9797823429107666, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.4038124978542328, "step": 929 }, { "completion_length": 127.9375, "epoch": 0.49759229534510435, "grad_norm": 0.8399348855018616, "kl": 0.16455698013305664, "learning_rate": 4.934316386673022e-06, "loss": 0.0066, "reward": 1.9564374685287476, "reward_std": 0.8547614812850952, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47206246852874756, "step": 930 }, { "completion_length": 147.8125, "epoch": 0.49812734082397003, "grad_norm": 1.3611494302749634, "kl": 0.13728366792201996, "learning_rate": 4.933961473340866e-06, "loss": 0.0055, "reward": 1.6690000295639038, "reward_std": 0.844332754611969, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4189999997615814, "step": 931 }, { "completion_length": 120.71875, "epoch": 0.4986623863028357, "grad_norm": 1.4848908185958862, "kl": 0.19724757969379425, "learning_rate": 4.933605616558603e-06, "loss": 0.0079, "reward": 1.9565937519073486, "reward_std": 1.0536401271820068, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48784375190734863, "step": 932 }, { "completion_length": 146.0, "epoch": 0.49919743178170145, "grad_norm": 22.697467803955078, "kl": 2.6762187480926514, "learning_rate": 4.933248816464168e-06, "loss": 0.107, "reward": 1.848312497138977, "reward_std": 0.6666223406791687, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47331249713897705, "step": 933 }, { "completion_length": 121.4375, "epoch": 0.49973247726056713, "grad_norm": 1.8494491577148438, "kl": 0.15497034788131714, "learning_rate": 4.932891073195864e-06, "loss": 0.0062, "reward": 1.9609375, "reward_std": 0.9964455366134644, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4765625, "step": 934 }, { "completion_length": 131.5, "epoch": 0.5002675227394329, "grad_norm": 1.490196943283081, "kl": 0.18846097588539124, "learning_rate": 4.93253238689236e-06, "loss": 0.0075, "reward": 2.16796875, "reward_std": 0.9279071092605591, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44921875, "step": 935 }, { "completion_length": 125.03125, "epoch": 0.5008025682182986, "grad_norm": 1.0256959199905396, "kl": 0.20979952812194824, "learning_rate": 4.932172757692689e-06, "loss": 0.0084, "reward": 1.9262499809265137, "reward_std": 0.8839100003242493, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47312498092651367, "step": 936 }, { "completion_length": 155.46875, "epoch": 0.5013376136971642, "grad_norm": 4.142951011657715, "kl": 0.1702761948108673, "learning_rate": 4.93181218573625e-06, "loss": 0.0068, "reward": 1.7421875, "reward_std": 0.8318495750427246, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 937 }, { "completion_length": 110.5, "epoch": 0.50187265917603, "grad_norm": 19218.626953125, "kl": 2201.55517578125, "learning_rate": 4.931450671162809e-06, "loss": 88.0622, "reward": 2.2820310592651367, "reward_std": 0.7384030818939209, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46953123807907104, "step": 938 }, { "completion_length": 123.875, "epoch": 0.5024077046548957, "grad_norm": 7.695942401885986, "kl": 0.1306128203868866, "learning_rate": 4.9310882141124935e-06, "loss": 0.0052, "reward": 2.28515625, "reward_std": 1.108654260635376, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 939 }, { "completion_length": 153.625, "epoch": 0.5029427501337613, "grad_norm": 0.9030515551567078, "kl": 0.14428086578845978, "learning_rate": 4.930724814725801e-06, "loss": 0.0058, "reward": 1.2481563091278076, "reward_std": 0.8189075589179993, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.40440624952316284, "step": 940 }, { "completion_length": 152.59375, "epoch": 0.5034777956126271, "grad_norm": 0.8559492826461792, "kl": 0.16604405641555786, "learning_rate": 4.930360473143591e-06, "loss": 0.0066, "reward": 2.3363125324249268, "reward_std": 1.2162983417510986, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43006250262260437, "step": 941 }, { "completion_length": 125.96875, "epoch": 0.5040128410914928, "grad_norm": 2.369502544403076, "kl": 0.174729585647583, "learning_rate": 4.9299951895070894e-06, "loss": 0.007, "reward": 1.8418437242507935, "reward_std": 0.6116156578063965, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48246875405311584, "step": 942 }, { "completion_length": 119.90625, "epoch": 0.5045478865703584, "grad_norm": 294.244384765625, "kl": 59.883758544921875, "learning_rate": 4.929628963957888e-06, "loss": 2.3953, "reward": 1.8498749732971191, "reward_std": 0.3271135091781616, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49050000309944153, "step": 943 }, { "completion_length": 131.625, "epoch": 0.5050829320492242, "grad_norm": 0.693569004535675, "kl": 0.12775743007659912, "learning_rate": 4.929261796637942e-06, "loss": 0.0051, "reward": 1.7383124828338623, "reward_std": 0.5653011202812195, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4883124828338623, "step": 944 }, { "completion_length": 144.125, "epoch": 0.5056179775280899, "grad_norm": 1.0331002473831177, "kl": 0.2363934963941574, "learning_rate": 4.928893687689572e-06, "loss": 0.0095, "reward": 1.8905000686645508, "reward_std": 0.7983402013778687, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.453000009059906, "step": 945 }, { "completion_length": 152.59375, "epoch": 0.5061530230069556, "grad_norm": 11.855405807495117, "kl": 0.21105293929576874, "learning_rate": 4.928524637255465e-06, "loss": 0.0084, "reward": 1.28125, "reward_std": 0.33156850934028625, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46875, "step": 946 }, { "completion_length": 120.34375, "epoch": 0.5066880684858213, "grad_norm": 9.623554229736328, "kl": 0.16952654719352722, "learning_rate": 4.928154645478672e-06, "loss": 0.0068, "reward": 2.078125, "reward_std": 0.713301420211792, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 947 }, { "completion_length": 111.28125, "epoch": 0.507223113964687, "grad_norm": 5056611.0, "kl": 433965.1875, "learning_rate": 4.927783712502609e-06, "loss": 17358.6094, "reward": 2.360562562942505, "reward_std": 0.8077459931373596, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4855625033378601, "step": 948 }, { "completion_length": 143.71875, "epoch": 0.5077581594435527, "grad_norm": 2.800220012664795, "kl": 0.1910908818244934, "learning_rate": 4.927411838471055e-06, "loss": 0.0076, "reward": 1.6075937747955322, "reward_std": 0.6693339943885803, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4669687747955322, "step": 949 }, { "completion_length": 133.75, "epoch": 0.5082932049224184, "grad_norm": 41.808387756347656, "kl": 4.971235752105713, "learning_rate": 4.9270390235281574e-06, "loss": 0.1988, "reward": 2.113187313079834, "reward_std": 1.1924149990081787, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4413124918937683, "step": 950 }, { "completion_length": 126.28125, "epoch": 0.5088282504012841, "grad_norm": 9.975041389465332, "kl": 0.19353148341178894, "learning_rate": 4.926665267818425e-06, "loss": 0.0077, "reward": 2.188999891281128, "reward_std": 0.9244208335876465, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4702500104904175, "step": 951 }, { "completion_length": 150.96875, "epoch": 0.5093632958801498, "grad_norm": 1.4148380756378174, "kl": 0.13061660528182983, "learning_rate": 4.926290571486736e-06, "loss": 0.0052, "reward": 1.69140625, "reward_std": 0.9537132382392883, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44140625, "step": 952 }, { "completion_length": 123.8125, "epoch": 0.5098983413590155, "grad_norm": 3.1126010417938232, "kl": 0.17832103371620178, "learning_rate": 4.925914934678326e-06, "loss": 0.0071, "reward": 2.5717811584472656, "reward_std": 0.9310402870178223, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4780312776565552, "step": 953 }, { "completion_length": 149.03125, "epoch": 0.5104333868378812, "grad_norm": 2.0590851306915283, "kl": 0.12227081507444382, "learning_rate": 4.925538357538801e-06, "loss": 0.0049, "reward": 1.2443125247955322, "reward_std": 0.8447738885879517, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.41618749499320984, "step": 954 }, { "completion_length": 128.5625, "epoch": 0.5109684323167469, "grad_norm": 337.5967102050781, "kl": 21.696563720703125, "learning_rate": 4.9251608402141275e-06, "loss": 0.8679, "reward": 1.9595625400543213, "reward_std": 0.6396902799606323, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4908125102519989, "step": 955 }, { "completion_length": 152.4375, "epoch": 0.5115034777956127, "grad_norm": 1.3691867589950562, "kl": 0.13735485076904297, "learning_rate": 4.924782382850641e-06, "loss": 0.0055, "reward": 1.5176875591278076, "reward_std": 0.7984777688980103, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45518749952316284, "step": 956 }, { "completion_length": 121.125, "epoch": 0.5120385232744783, "grad_norm": 7.563607692718506, "kl": 0.1707065999507904, "learning_rate": 4.924402985595037e-06, "loss": 0.0068, "reward": 2.686187505722046, "reward_std": 1.0035207271575928, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4986875057220459, "step": 957 }, { "completion_length": 139.25, "epoch": 0.512573568753344, "grad_norm": 1.4314287900924683, "kl": 0.16424565017223358, "learning_rate": 4.924022648594379e-06, "loss": 0.0066, "reward": 1.7460312843322754, "reward_std": 0.8476369976997375, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.449156254529953, "step": 958 }, { "completion_length": 125.09375, "epoch": 0.5131086142322098, "grad_norm": 2.658806324005127, "kl": 0.24325701594352722, "learning_rate": 4.923641371996091e-06, "loss": 0.0097, "reward": 1.9891250133514404, "reward_std": 0.8479989767074585, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47350001335144043, "step": 959 }, { "completion_length": 153.3125, "epoch": 0.5136436597110754, "grad_norm": 2.5974316596984863, "kl": 0.12189368158578873, "learning_rate": 4.923259155947965e-06, "loss": 0.0049, "reward": 1.0590312480926514, "reward_std": 0.49737972021102905, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.37153124809265137, "step": 960 }, { "completion_length": 144.84375, "epoch": 0.5141787051899411, "grad_norm": 1.926938533782959, "kl": 0.2018618881702423, "learning_rate": 4.922876000598153e-06, "loss": 0.0081, "reward": 1.8216875791549683, "reward_std": 0.9564383625984192, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4310625195503235, "step": 961 }, { "completion_length": 142.03125, "epoch": 0.5147137506688069, "grad_norm": 3.3073480129241943, "kl": 0.16302461922168732, "learning_rate": 4.922491906095175e-06, "loss": 0.0065, "reward": 1.8021249771118164, "reward_std": 1.081573247909546, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4583750069141388, "step": 962 }, { "completion_length": 138.09375, "epoch": 0.5152487961476726, "grad_norm": 3.0606689453125, "kl": 0.21830469369888306, "learning_rate": 4.922106872587913e-06, "loss": 0.0087, "reward": 1.420468807220459, "reward_std": 0.8154971599578857, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4204687476158142, "step": 963 }, { "completion_length": 130.5625, "epoch": 0.5157838416265382, "grad_norm": 1.145822286605835, "kl": 0.15515443682670593, "learning_rate": 4.921720900225613e-06, "loss": 0.0062, "reward": 2.5126874446868896, "reward_std": 1.049626350402832, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4814375042915344, "step": 964 }, { "completion_length": 125.15625, "epoch": 0.516318887105404, "grad_norm": 1.9805179834365845, "kl": 0.16429638862609863, "learning_rate": 4.921333989157885e-06, "loss": 0.0066, "reward": 2.200500011444092, "reward_std": 1.003607988357544, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4817500114440918, "step": 965 }, { "completion_length": 128.46875, "epoch": 0.5168539325842697, "grad_norm": 37870012.0, "kl": 1108979.625, "learning_rate": 4.920946139534704e-06, "loss": 44359.1836, "reward": 1.77734375, "reward_std": 0.9131333231925964, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46484375, "step": 966 }, { "completion_length": 146.53125, "epoch": 0.5173889780631353, "grad_norm": 6.201975345611572, "kl": 0.24458631873130798, "learning_rate": 4.920557351506409e-06, "loss": 0.0098, "reward": 1.6287813186645508, "reward_std": 0.7350428700447083, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 967 }, { "completion_length": 159.25, "epoch": 0.5179240235420011, "grad_norm": 2.1299540996551514, "kl": 0.12871429324150085, "learning_rate": 4.920167625223699e-06, "loss": 0.0051, "reward": 1.375, "reward_std": 0.5680021643638611, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40625, "step": 968 }, { "completion_length": 157.65625, "epoch": 0.5184590690208668, "grad_norm": 1.8356574773788452, "kl": 0.11166861653327942, "learning_rate": 4.919776960837641e-06, "loss": 0.0045, "reward": 1.2107499837875366, "reward_std": 0.7032003402709961, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.367000013589859, "step": 969 }, { "completion_length": 95.875, "epoch": 0.5189941144997324, "grad_norm": 2.5016067028045654, "kl": 0.2325916290283203, "learning_rate": 4.919385358499664e-06, "loss": 0.0093, "reward": 2.450312614440918, "reward_std": 0.8320460319519043, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4971874952316284, "step": 970 }, { "completion_length": 153.9375, "epoch": 0.5195291599785982, "grad_norm": 0.8358108997344971, "kl": 0.19332124292850494, "learning_rate": 4.91899281836156e-06, "loss": 0.0077, "reward": 1.4723124504089355, "reward_std": 0.6891840100288391, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4254375100135803, "step": 971 }, { "completion_length": 131.34375, "epoch": 0.5200642054574639, "grad_norm": 1.230191707611084, "kl": 0.1578829437494278, "learning_rate": 4.9185993405754856e-06, "loss": 0.0063, "reward": 1.429843783378601, "reward_std": 0.7992020845413208, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4610937535762787, "step": 972 }, { "completion_length": 130.78125, "epoch": 0.5205992509363296, "grad_norm": 1.3281919956207275, "kl": 0.200199693441391, "learning_rate": 4.91820492529396e-06, "loss": 0.008, "reward": 2.210031270980835, "reward_std": 0.4492960274219513, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4444062411785126, "step": 973 }, { "completion_length": 110.09375, "epoch": 0.5211342964151953, "grad_norm": 1.0989501476287842, "kl": 0.19652059674263, "learning_rate": 4.917809572669867e-06, "loss": 0.0079, "reward": 2.384500026702881, "reward_std": 0.8335615396499634, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47825002670288086, "step": 974 }, { "completion_length": 137.09375, "epoch": 0.521669341894061, "grad_norm": 1.1475099325180054, "kl": 0.15325719118118286, "learning_rate": 4.9174132828564505e-06, "loss": 0.0061, "reward": 1.7864375114440918, "reward_std": 0.4718078672885895, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4583125114440918, "step": 975 }, { "completion_length": 126.8125, "epoch": 0.5222043873729267, "grad_norm": 3.385530710220337, "kl": 0.22389890253543854, "learning_rate": 4.917016056007323e-06, "loss": 0.009, "reward": 2.1243748664855957, "reward_std": 0.9978647828102112, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49937498569488525, "step": 976 }, { "completion_length": 174.21875, "epoch": 0.5227394328517924, "grad_norm": 1.4526376724243164, "kl": 0.10722661018371582, "learning_rate": 4.916617892276455e-06, "loss": 0.0043, "reward": 0.7724375128746033, "reward_std": 0.5574971437454224, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.3349374830722809, "step": 977 }, { "completion_length": 129.90625, "epoch": 0.5232744783306581, "grad_norm": 1010551.625, "kl": 78934.6171875, "learning_rate": 4.9162187918181836e-06, "loss": 3157.3843, "reward": 1.6795624494552612, "reward_std": 0.668827474117279, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.460812509059906, "step": 978 }, { "completion_length": 158.03125, "epoch": 0.5238095238095238, "grad_norm": 1.2591569423675537, "kl": 0.17394763231277466, "learning_rate": 4.9158187547872075e-06, "loss": 0.007, "reward": 1.0229063034057617, "reward_std": 0.6198656558990479, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.38228124380111694, "step": 979 }, { "completion_length": 129.59375, "epoch": 0.5243445692883895, "grad_norm": 19018.9375, "kl": 3794.295654296875, "learning_rate": 4.915417781338588e-06, "loss": 151.7718, "reward": 2.030125141143799, "reward_std": 1.0136206150054932, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4676249921321869, "step": 980 }, { "completion_length": 131.71875, "epoch": 0.5248796147672552, "grad_norm": 12.107614517211914, "kl": 1.7658331394195557, "learning_rate": 4.915015871627752e-06, "loss": 0.0706, "reward": 1.81640625, "reward_std": 0.6821363568305969, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48828125, "step": 981 }, { "completion_length": 112.125, "epoch": 0.5254146602461209, "grad_norm": 0.9887367486953735, "kl": 0.2233508825302124, "learning_rate": 4.914613025810485e-06, "loss": 0.0089, "reward": 2.288343906402588, "reward_std": 0.9322530031204224, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47584375739097595, "step": 982 }, { "completion_length": 137.03125, "epoch": 0.5259497057249867, "grad_norm": 1.16054105758667, "kl": 0.20803813636302948, "learning_rate": 4.914209244042939e-06, "loss": 0.0083, "reward": 1.7600936889648438, "reward_std": 0.8811836242675781, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4475937485694885, "step": 983 }, { "completion_length": 155.3125, "epoch": 0.5264847512038523, "grad_norm": 0.6777178049087524, "kl": 0.13235460221767426, "learning_rate": 4.913804526481628e-06, "loss": 0.0053, "reward": 1.1946874856948853, "reward_std": 0.8384947776794434, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.39781248569488525, "step": 984 }, { "completion_length": 127.4375, "epoch": 0.527019796682718, "grad_norm": 2.127840280532837, "kl": 0.22988778352737427, "learning_rate": 4.913398873283427e-06, "loss": 0.0092, "reward": 2.109375, "reward_std": 0.9154088497161865, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.484375, "step": 985 }, { "completion_length": 119.96875, "epoch": 0.5275548421615838, "grad_norm": 3.2847089767456055, "kl": 0.21523064374923706, "learning_rate": 4.912992284605577e-06, "loss": 0.0086, "reward": 2.361093759536743, "reward_std": 0.7349543571472168, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48609375953674316, "step": 986 }, { "completion_length": 142.4375, "epoch": 0.5280898876404494, "grad_norm": 1.3032996654510498, "kl": 0.19107556343078613, "learning_rate": 4.912584760605677e-06, "loss": 0.0076, "reward": 1.2910000085830688, "reward_std": 0.7520906925201416, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.44724997878074646, "step": 987 }, { "completion_length": 137.96875, "epoch": 0.5286249331193151, "grad_norm": 11.856766700744629, "kl": 0.4749090075492859, "learning_rate": 4.912176301441693e-06, "loss": 0.019, "reward": 1.576624870300293, "reward_std": 0.8079310655593872, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4203749895095825, "step": 988 }, { "completion_length": 115.625, "epoch": 0.5291599785981809, "grad_norm": 1.6873747110366821, "kl": 0.2095797061920166, "learning_rate": 4.9117669072719506e-06, "loss": 0.0084, "reward": 2.359375, "reward_std": 0.824476957321167, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.5, "step": 989 }, { "completion_length": 127.84375, "epoch": 0.5296950240770465, "grad_norm": 2.824720621109009, "kl": 0.26570451259613037, "learning_rate": 4.911356578255139e-06, "loss": 0.0106, "reward": 2.090968608856201, "reward_std": 1.1036500930786133, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4815937578678131, "step": 990 }, { "completion_length": 124.09375, "epoch": 0.5302300695559122, "grad_norm": 5.265120983123779, "kl": 0.20941542088985443, "learning_rate": 4.91094531455031e-06, "loss": 0.0084, "reward": 1.400031328201294, "reward_std": 0.6878117918968201, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44690626859664917, "step": 991 }, { "completion_length": 107.53125, "epoch": 0.530765115034778, "grad_norm": 1.3389049768447876, "kl": 0.22039085626602173, "learning_rate": 4.910533116316878e-06, "loss": 0.0088, "reward": 2.4746875762939453, "reward_std": 0.9471460580825806, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49031251668930054, "step": 992 }, { "completion_length": 131.03125, "epoch": 0.5313001605136437, "grad_norm": 0.9441580772399902, "kl": 0.16653862595558167, "learning_rate": 4.910119983714616e-06, "loss": 0.0067, "reward": 1.9137499332427979, "reward_std": 0.45332401990890503, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4606249928474426, "step": 993 }, { "completion_length": 121.125, "epoch": 0.5318352059925093, "grad_norm": 1.3954780101776123, "kl": 0.2859213352203369, "learning_rate": 4.909705916903666e-06, "loss": 0.0114, "reward": 2.5595624446868896, "reward_std": 0.7436521053314209, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4501875042915344, "step": 994 }, { "completion_length": 146.28125, "epoch": 0.5323702514713751, "grad_norm": 2705460.25, "kl": 11156.078125, "learning_rate": 4.909290916044525e-06, "loss": 446.2432, "reward": 2.384718894958496, "reward_std": 1.1251639127731323, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44721874594688416, "step": 995 }, { "completion_length": 138.40625, "epoch": 0.5329052969502408, "grad_norm": 0.9972122311592102, "kl": 0.173615962266922, "learning_rate": 4.908874981298058e-06, "loss": 0.0069, "reward": 1.9474999904632568, "reward_std": 1.0634565353393555, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43187499046325684, "step": 996 }, { "completion_length": 155.84375, "epoch": 0.5334403424291064, "grad_norm": 3.158629894256592, "kl": 0.1542002260684967, "learning_rate": 4.908458112825487e-06, "loss": 0.0062, "reward": 1.4428436756134033, "reward_std": 0.8299652338027954, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.4428437352180481, "step": 997 }, { "completion_length": 139.5625, "epoch": 0.5339753879079722, "grad_norm": 20.884645462036133, "kl": 0.512790858745575, "learning_rate": 4.908040310788399e-06, "loss": 0.0205, "reward": 2.436312437057495, "reward_std": 0.8375983834266663, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4831874966621399, "step": 998 }, { "completion_length": 116.5625, "epoch": 0.5345104333868379, "grad_norm": 11.140335083007812, "kl": 0.1961006224155426, "learning_rate": 4.907621575348742e-06, "loss": 0.0078, "reward": 2.6247501373291016, "reward_std": 0.8837915658950806, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499750018119812, "step": 999 }, { "completion_length": 141.0625, "epoch": 0.5350454788657036, "grad_norm": 14.941328048706055, "kl": 0.16896136105060577, "learning_rate": 4.907201906668827e-06, "loss": 0.0068, "reward": 1.2675000429153442, "reward_std": 0.7321887612342834, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.43937501311302185, "step": 1000 }, { "completion_length": 147.5625, "epoch": 0.5355805243445693, "grad_norm": 3.9846928119659424, "kl": 0.14992932975292206, "learning_rate": 4.9067813049113246e-06, "loss": 0.006, "reward": 1.879312515258789, "reward_std": 0.7691614627838135, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47306251525878906, "step": 1001 }, { "completion_length": 141.875, "epoch": 0.536115569823435, "grad_norm": 0.7336996793746948, "kl": 0.199136421084404, "learning_rate": 4.906359770239267e-06, "loss": 0.008, "reward": 1.7567499876022339, "reward_std": 1.016911268234253, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4754999876022339, "step": 1002 }, { "completion_length": 134.75, "epoch": 0.5366506153023007, "grad_norm": 2.522775173187256, "kl": 0.16615206003189087, "learning_rate": 4.905937302816052e-06, "loss": 0.0066, "reward": 2.5953125953674316, "reward_std": 0.8418669700622559, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4703125059604645, "step": 1003 }, { "completion_length": 130.75, "epoch": 0.5371856607811664, "grad_norm": 1.2267168760299683, "kl": 0.23207470774650574, "learning_rate": 4.905513902805433e-06, "loss": 0.0093, "reward": 1.5625, "reward_std": 0.5368680953979492, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.5, "step": 1004 }, { "completion_length": 114.96875, "epoch": 0.5377207062600321, "grad_norm": 0.7824625372886658, "kl": 0.1631050407886505, "learning_rate": 4.90508957037153e-06, "loss": 0.0065, "reward": 2.6818125247955322, "reward_std": 0.5798236131668091, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47868749499320984, "step": 1005 }, { "completion_length": 140.0625, "epoch": 0.5382557517388978, "grad_norm": 1.9862316846847534, "kl": 0.18124139308929443, "learning_rate": 4.904664305678822e-06, "loss": 0.0072, "reward": 1.4823124408721924, "reward_std": 0.616513729095459, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43543750047683716, "step": 1006 }, { "completion_length": 146.75, "epoch": 0.5387907972177635, "grad_norm": 2.724867582321167, "kl": 0.13905103504657745, "learning_rate": 4.904238108892149e-06, "loss": 0.0056, "reward": 2.117000102996826, "reward_std": 1.1477043628692627, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4294999837875366, "step": 1007 }, { "completion_length": 137.375, "epoch": 0.5393258426966292, "grad_norm": 3.271315813064575, "kl": 0.20030492544174194, "learning_rate": 4.903810980176713e-06, "loss": 0.008, "reward": 1.584625005722046, "reward_std": 0.8462921380996704, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4283750057220459, "step": 1008 }, { "completion_length": 139.71875, "epoch": 0.5398608881754949, "grad_norm": 15.578524589538574, "kl": 0.5294256806373596, "learning_rate": 4.903382919698079e-06, "loss": 0.0212, "reward": 1.92578125, "reward_std": 0.8337612152099609, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1009 }, { "completion_length": 138.5625, "epoch": 0.5403959336543607, "grad_norm": 2.191188097000122, "kl": 0.3336067795753479, "learning_rate": 4.902953927622169e-06, "loss": 0.0133, "reward": 1.56640625, "reward_std": 0.792109489440918, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1010 }, { "completion_length": 112.59375, "epoch": 0.5409309791332263, "grad_norm": 2.016542673110962, "kl": 0.17477557063102722, "learning_rate": 4.902524004115271e-06, "loss": 0.007, "reward": 1.5342812538146973, "reward_std": 0.9251686334609985, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44053125381469727, "step": 1011 }, { "completion_length": 146.71875, "epoch": 0.541466024612092, "grad_norm": 1.2317475080490112, "kl": 0.13121291995048523, "learning_rate": 4.90209314934403e-06, "loss": 0.0052, "reward": 1.4611562490463257, "reward_std": 1.0920019149780273, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.3674062490463257, "step": 1012 }, { "completion_length": 107.5625, "epoch": 0.5420010700909578, "grad_norm": 3.3406245708465576, "kl": 0.2400461733341217, "learning_rate": 4.901661363475453e-06, "loss": 0.0096, "reward": 2.9283437728881836, "reward_std": 0.9357098340988159, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4752187728881836, "step": 1013 }, { "completion_length": 126.15625, "epoch": 0.5425361155698234, "grad_norm": 2.2501776218414307, "kl": 0.147504523396492, "learning_rate": 4.90122864667691e-06, "loss": 0.0059, "reward": 2.0624375343322754, "reward_std": 0.7780464887619019, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.484312504529953, "step": 1014 }, { "completion_length": 127.6875, "epoch": 0.5430711610486891, "grad_norm": 2.3849756717681885, "kl": 0.21685689687728882, "learning_rate": 4.90079499911613e-06, "loss": 0.0087, "reward": 2.3263750076293945, "reward_std": 0.49144452810287476, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46700000762939453, "step": 1015 }, { "completion_length": 130.125, "epoch": 0.5436062065275549, "grad_norm": 1.023421049118042, "kl": 0.18747559189796448, "learning_rate": 4.9003604209612025e-06, "loss": 0.0075, "reward": 2.2759687900543213, "reward_std": 0.401464581489563, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4634687602519989, "step": 1016 }, { "completion_length": 137.9375, "epoch": 0.5441412520064205, "grad_norm": 5.113796234130859, "kl": 0.15003012120723724, "learning_rate": 4.8999249123805796e-06, "loss": 0.006, "reward": 2.623281240463257, "reward_std": 1.150266170501709, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4357812702655792, "step": 1017 }, { "completion_length": 150.1875, "epoch": 0.5446762974852862, "grad_norm": 7.604515075683594, "kl": 0.14395996928215027, "learning_rate": 4.89948847354307e-06, "loss": 0.0058, "reward": 1.1286561489105225, "reward_std": 0.7247155904769897, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.378656268119812, "step": 1018 }, { "completion_length": 156.9375, "epoch": 0.545211342964152, "grad_norm": 219.1999969482422, "kl": 6.055263996124268, "learning_rate": 4.8990511046178494e-06, "loss": 0.2422, "reward": 1.01953125, "reward_std": 0.8443783521652222, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.34765625, "step": 1019 }, { "completion_length": 116.0625, "epoch": 0.5457463884430177, "grad_norm": 0.5860896110534668, "kl": 0.16101649403572083, "learning_rate": 4.898612805774448e-06, "loss": 0.0064, "reward": 2.6030938625335693, "reward_std": 0.8422538042068481, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4780937433242798, "step": 1020 }, { "completion_length": 153.71875, "epoch": 0.5462814339218833, "grad_norm": 2.5642051696777344, "kl": 0.19013440608978271, "learning_rate": 4.8981735771827585e-06, "loss": 0.0076, "reward": 1.3812499046325684, "reward_std": 0.7029690146446228, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3812499940395355, "step": 1021 }, { "completion_length": 123.875, "epoch": 0.5468164794007491, "grad_norm": 1.044249415397644, "kl": 0.19875067472457886, "learning_rate": 4.897733419013036e-06, "loss": 0.008, "reward": 1.8748750686645508, "reward_std": 1.0557219982147217, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.453000009059906, "step": 1022 }, { "completion_length": 144.875, "epoch": 0.5473515248796148, "grad_norm": 3.996394634246826, "kl": 0.2594119608402252, "learning_rate": 4.897292331435893e-06, "loss": 0.0104, "reward": 2.2000625133514404, "reward_std": 0.7741168737411499, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41881251335144043, "step": 1023 }, { "completion_length": 140.3125, "epoch": 0.5478865703584804, "grad_norm": 1.3996096849441528, "kl": 0.2030351459980011, "learning_rate": 4.8968503146223046e-06, "loss": 0.0081, "reward": 1.8532500267028809, "reward_std": 0.8533661365509033, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44699999690055847, "step": 1024 }, { "completion_length": 161.25, "epoch": 0.5484216158373462, "grad_norm": 4.044897079467773, "kl": 0.2039957344532013, "learning_rate": 4.896407368743603e-06, "loss": 0.0082, "reward": 1.389937400817871, "reward_std": 1.0541731119155884, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.38993749022483826, "step": 1025 }, { "completion_length": 148.6875, "epoch": 0.5489566613162119, "grad_norm": 3.5237746238708496, "kl": 0.1401701122522354, "learning_rate": 4.895963493971485e-06, "loss": 0.0056, "reward": 1.9214999675750732, "reward_std": 0.5722250938415527, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.42149999737739563, "step": 1026 }, { "completion_length": 152.25, "epoch": 0.5494917067950775, "grad_norm": 1.0528616905212402, "kl": 0.12396860122680664, "learning_rate": 4.895518690478004e-06, "loss": 0.005, "reward": 1.23228120803833, "reward_std": 0.7246827483177185, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41978123784065247, "step": 1027 }, { "completion_length": 118.65625, "epoch": 0.5500267522739433, "grad_norm": 2.2891597747802734, "kl": 0.2414444386959076, "learning_rate": 4.8950729584355735e-06, "loss": 0.0097, "reward": 2.2569375038146973, "reward_std": 0.9425349235534668, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49131250381469727, "step": 1028 }, { "completion_length": 129.96875, "epoch": 0.550561797752809, "grad_norm": 1.6091418266296387, "kl": 0.1418222337961197, "learning_rate": 4.894626298016969e-06, "loss": 0.0057, "reward": 2.0980312824249268, "reward_std": 0.43386924266815186, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44178125262260437, "step": 1029 }, { "completion_length": 139.8125, "epoch": 0.5510968432316747, "grad_norm": 1.5471723079681396, "kl": 0.20132161676883698, "learning_rate": 4.894178709395324e-06, "loss": 0.0081, "reward": 2.042375087738037, "reward_std": 0.7808854579925537, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.44862496852874756, "step": 1030 }, { "completion_length": 126.59375, "epoch": 0.5516318887105404, "grad_norm": 5.207352638244629, "kl": 0.22124378383159637, "learning_rate": 4.8937301927441325e-06, "loss": 0.0088, "reward": 1.88671875, "reward_std": 0.759105384349823, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.46484375, "step": 1031 }, { "completion_length": 151.25, "epoch": 0.5521669341894061, "grad_norm": 690.578857421875, "kl": 45.3003044128418, "learning_rate": 4.893280748237248e-06, "loss": 1.812, "reward": 1.4873125553131104, "reward_std": 0.8735317587852478, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4248124957084656, "step": 1032 }, { "completion_length": 134.5, "epoch": 0.5527019796682718, "grad_norm": 8.249608039855957, "kl": 0.31370261311531067, "learning_rate": 4.892830376048884e-06, "loss": 0.0125, "reward": 2.1665937900543213, "reward_std": 0.8378809690475464, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4478437304496765, "step": 1033 }, { "completion_length": 136.96875, "epoch": 0.5532370251471375, "grad_norm": 1.455958366394043, "kl": 0.18248268961906433, "learning_rate": 4.892379076353613e-06, "loss": 0.0073, "reward": 1.606374979019165, "reward_std": 0.6106572151184082, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4657500088214874, "step": 1034 }, { "completion_length": 146.96875, "epoch": 0.5537720706260032, "grad_norm": 0.7765586972236633, "kl": 0.13001564145088196, "learning_rate": 4.8919268493263686e-06, "loss": 0.0052, "reward": 1.3089063167572021, "reward_std": 0.7367780208587646, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4182812571525574, "step": 1035 }, { "completion_length": 134.09375, "epoch": 0.5543071161048689, "grad_norm": 2.0488696098327637, "kl": 0.1727883368730545, "learning_rate": 4.891473695142441e-06, "loss": 0.0069, "reward": 1.7694687843322754, "reward_std": 0.9517471790313721, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.441343754529953, "step": 1036 }, { "completion_length": 149.40625, "epoch": 0.5548421615837347, "grad_norm": 4.741456031799316, "kl": 0.17348739504814148, "learning_rate": 4.891019613977484e-06, "loss": 0.0069, "reward": 2.031125068664551, "reward_std": 0.889182984828949, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.453000009059906, "step": 1037 }, { "completion_length": 143.1875, "epoch": 0.5553772070626003, "grad_norm": 0.8577002286911011, "kl": 0.18250346183776855, "learning_rate": 4.890564606007504e-06, "loss": 0.0073, "reward": 1.9911251068115234, "reward_std": 1.259606957435608, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4286250174045563, "step": 1038 }, { "completion_length": 134.34375, "epoch": 0.555912252541466, "grad_norm": 1.6741232872009277, "kl": 0.21037863194942474, "learning_rate": 4.8901086714088744e-06, "loss": 0.0084, "reward": 2.358656406402588, "reward_std": 0.8557361960411072, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46803125739097595, "step": 1039 }, { "completion_length": 143.6875, "epoch": 0.5564472980203318, "grad_norm": 0.931276261806488, "kl": 0.2234838604927063, "learning_rate": 4.889651810358323e-06, "loss": 0.0089, "reward": 2.0928125381469727, "reward_std": 0.6723317503929138, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49906250834465027, "step": 1040 }, { "completion_length": 121.0625, "epoch": 0.5569823434991974, "grad_norm": 4.942202568054199, "kl": 0.17898184061050415, "learning_rate": 4.889194023032938e-06, "loss": 0.0072, "reward": 1.8510000705718994, "reward_std": 0.7708742618560791, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.47600001096725464, "step": 1041 }, { "completion_length": 136.09375, "epoch": 0.5575173889780631, "grad_norm": 2.5838189125061035, "kl": 0.18229244649410248, "learning_rate": 4.8887353096101665e-06, "loss": 0.0073, "reward": 1.7848124504089355, "reward_std": 1.0391665697097778, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4410625100135803, "step": 1042 }, { "completion_length": 135.8125, "epoch": 0.5580524344569289, "grad_norm": 73.04996490478516, "kl": 2.069328784942627, "learning_rate": 4.888275670267814e-06, "loss": 0.0828, "reward": 2.4680001735687256, "reward_std": 1.144526481628418, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46799999475479126, "step": 1043 }, { "completion_length": 128.5625, "epoch": 0.5585874799357945, "grad_norm": 1.1337534189224243, "kl": 0.18738305568695068, "learning_rate": 4.887815105184048e-06, "loss": 0.0075, "reward": 1.5538749694824219, "reward_std": 0.9044245481491089, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47574999928474426, "step": 1044 }, { "completion_length": 129.84375, "epoch": 0.5591225254146602, "grad_norm": 1.923997402191162, "kl": 0.18293532729148865, "learning_rate": 4.887353614537388e-06, "loss": 0.0073, "reward": 1.66796875, "reward_std": 0.7009792327880859, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48046875, "step": 1045 }, { "completion_length": 130.53125, "epoch": 0.559657570893526, "grad_norm": 2.416674852371216, "kl": 0.3670528531074524, "learning_rate": 4.88689119850672e-06, "loss": 0.0147, "reward": 1.9383437633514404, "reward_std": 0.5456063747406006, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40709376335144043, "step": 1046 }, { "completion_length": 148.9375, "epoch": 0.5601926163723917, "grad_norm": 1.2858270406723022, "kl": 0.1778603494167328, "learning_rate": 4.886427857271284e-06, "loss": 0.0071, "reward": 1.45703125, "reward_std": 0.9320777654647827, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.44140625, "step": 1047 }, { "completion_length": 134.09375, "epoch": 0.5607276618512573, "grad_norm": 2.5623364448547363, "kl": 0.1810072660446167, "learning_rate": 4.885963591010681e-06, "loss": 0.0072, "reward": 1.8665000200271606, "reward_std": 0.8984411358833313, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47587499022483826, "step": 1048 }, { "completion_length": 142.6875, "epoch": 0.5612627073301231, "grad_norm": 1.319503903388977, "kl": 0.19201433658599854, "learning_rate": 4.885498399904869e-06, "loss": 0.0077, "reward": 1.156656265258789, "reward_std": 0.921699583530426, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.39103126525878906, "step": 1049 }, { "completion_length": 146.9375, "epoch": 0.5617977528089888, "grad_norm": 2.4019482135772705, "kl": 0.15042763948440552, "learning_rate": 4.8850322841341645e-06, "loss": 0.006, "reward": 1.9428750276565552, "reward_std": 0.8400539755821228, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4428749978542328, "step": 1050 }, { "completion_length": 110.75, "epoch": 0.5623327982878544, "grad_norm": 311923.3125, "kl": 316.44097900390625, "learning_rate": 4.884565243879243e-06, "loss": 12.6576, "reward": 2.227156162261963, "reward_std": 0.7836116552352905, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47715625166893005, "step": 1051 }, { "completion_length": 151.0, "epoch": 0.5628678437667202, "grad_norm": 4.35500955581665, "kl": 0.3180682063102722, "learning_rate": 4.8840972793211385e-06, "loss": 0.0127, "reward": 1.3102812767028809, "reward_std": 0.6315192580223083, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4040312170982361, "step": 1052 }, { "completion_length": 109.78125, "epoch": 0.5634028892455859, "grad_norm": 2.1039464473724365, "kl": 0.20809178054332733, "learning_rate": 4.883628390641243e-06, "loss": 0.0083, "reward": 2.5795936584472656, "reward_std": 0.7144493460655212, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4858437478542328, "step": 1053 }, { "completion_length": 134.875, "epoch": 0.5639379347244515, "grad_norm": 3.930652379989624, "kl": 0.1725364327430725, "learning_rate": 4.8831585780213075e-06, "loss": 0.0069, "reward": 2.0964999198913574, "reward_std": 0.724783182144165, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4715000092983246, "step": 1054 }, { "completion_length": 138.4375, "epoch": 0.5644729802033173, "grad_norm": 1.920764684677124, "kl": 0.11194949597120285, "learning_rate": 4.8826878416434385e-06, "loss": 0.0045, "reward": 2.3801560401916504, "reward_std": 0.9901347756385803, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4270312488079071, "step": 1055 }, { "completion_length": 154.78125, "epoch": 0.565008025682183, "grad_norm": 0.7874099016189575, "kl": 0.1686628758907318, "learning_rate": 4.882216181690105e-06, "loss": 0.0067, "reward": 1.554781198501587, "reward_std": 0.43551287055015564, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3985312581062317, "step": 1056 }, { "completion_length": 141.46875, "epoch": 0.5655430711610487, "grad_norm": 6.565095901489258, "kl": 0.40029507875442505, "learning_rate": 4.8817435983441285e-06, "loss": 0.016, "reward": 1.8608124256134033, "reward_std": 0.9643735885620117, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4545624852180481, "step": 1057 }, { "completion_length": 157.84375, "epoch": 0.5660781166399144, "grad_norm": 36540472.0, "kl": 2181677.75, "learning_rate": 4.881270091788694e-06, "loss": 87267.125, "reward": 1.5921249389648438, "reward_std": 0.8561820387840271, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4514999985694885, "step": 1058 }, { "completion_length": 122.625, "epoch": 0.5666131621187801, "grad_norm": 5.375333309173584, "kl": 0.8096136450767517, "learning_rate": 4.88079566220734e-06, "loss": 0.0324, "reward": 2.06640625, "reward_std": 0.8191737532615662, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 1059 }, { "completion_length": 136.4375, "epoch": 0.5671482075976458, "grad_norm": 2.0802063941955566, "kl": 0.12936894595623016, "learning_rate": 4.880320309783964e-06, "loss": 0.0052, "reward": 2.37890625, "reward_std": 0.5739598274230957, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1060 }, { "completion_length": 138.25, "epoch": 0.5676832530765115, "grad_norm": 1.385359764099121, "kl": 0.19327673316001892, "learning_rate": 4.8798440347028244e-06, "loss": 0.0077, "reward": 1.92578125, "reward_std": 0.728771448135376, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 1061 }, { "completion_length": 126.15625, "epoch": 0.5682182985553772, "grad_norm": 3.9788854122161865, "kl": 0.27264294028282166, "learning_rate": 4.879366837148532e-06, "loss": 0.0109, "reward": 2.308468818664551, "reward_std": 0.9036186933517456, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.480343759059906, "step": 1062 }, { "completion_length": 108.09375, "epoch": 0.5687533440342429, "grad_norm": 18.28485870361328, "kl": 0.326016902923584, "learning_rate": 4.878888717306058e-06, "loss": 0.013, "reward": 2.295281171798706, "reward_std": 1.0071489810943604, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48278123140335083, "step": 1063 }, { "completion_length": 149.4375, "epoch": 0.5692883895131086, "grad_norm": 1.0720633268356323, "kl": 0.22292962670326233, "learning_rate": 4.878409675360733e-06, "loss": 0.0089, "reward": 1.1511561870574951, "reward_std": 0.7159637212753296, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4011562466621399, "step": 1064 }, { "completion_length": 138.9375, "epoch": 0.5698234349919743, "grad_norm": 3.9507181644439697, "kl": 0.1782495379447937, "learning_rate": 4.87792971149824e-06, "loss": 0.0071, "reward": 1.5231249332427979, "reward_std": 0.7501187324523926, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4918749928474426, "step": 1065 }, { "completion_length": 134.65625, "epoch": 0.57035848047084, "grad_norm": 859311936.0, "kl": 1096486.125, "learning_rate": 4.877448825904625e-06, "loss": 43859.4414, "reward": 1.2729687690734863, "reward_std": 0.5325682163238525, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46046873927116394, "step": 1066 }, { "completion_length": 139.0625, "epoch": 0.5708935259497058, "grad_norm": 8.5892915725708, "kl": 1.0142310857772827, "learning_rate": 4.876967018766286e-06, "loss": 0.0406, "reward": 1.83203125, "reward_std": 0.8918441534042358, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1067 }, { "completion_length": 114.53125, "epoch": 0.5714285714285714, "grad_norm": 1.2992644309997559, "kl": 0.3035893738269806, "learning_rate": 4.876484290269982e-06, "loss": 0.0121, "reward": 2.30078125, "reward_std": 1.1315505504608154, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1068 }, { "completion_length": 113.1875, "epoch": 0.5719636169074371, "grad_norm": 1.1948047876358032, "kl": 0.22894611954689026, "learning_rate": 4.876000640602827e-06, "loss": 0.0092, "reward": 2.653749942779541, "reward_std": 0.8608795404434204, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4975000023841858, "step": 1069 }, { "completion_length": 114.84375, "epoch": 0.5724986623863029, "grad_norm": 1.269864797592163, "kl": 0.17231428623199463, "learning_rate": 4.875516069952293e-06, "loss": 0.0069, "reward": 2.46875, "reward_std": 1.016597867012024, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 1070 }, { "completion_length": 139.125, "epoch": 0.5730337078651685, "grad_norm": 0.585962176322937, "kl": 0.20687706768512726, "learning_rate": 4.87503057850621e-06, "loss": 0.0083, "reward": 2.02734375, "reward_std": 0.8751394748687744, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49609375, "step": 1071 }, { "completion_length": 149.03125, "epoch": 0.5735687533440342, "grad_norm": 5.290202617645264, "kl": 0.23836158215999603, "learning_rate": 4.874544166452763e-06, "loss": 0.0095, "reward": 1.95493745803833, "reward_std": 0.3088223338127136, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.37681251764297485, "step": 1072 }, { "completion_length": 123.21875, "epoch": 0.5741037988229, "grad_norm": 1.1154141426086426, "kl": 0.20485232770442963, "learning_rate": 4.874056833980494e-06, "loss": 0.0082, "reward": 2.160875082015991, "reward_std": 1.0645190477371216, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47337502241134644, "step": 1073 }, { "completion_length": 142.5, "epoch": 0.5746388443017657, "grad_norm": 1.7556183338165283, "kl": 0.1524031162261963, "learning_rate": 4.8735685812783025e-06, "loss": 0.0061, "reward": 1.4825313091278076, "reward_std": 0.6805171966552734, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43565624952316284, "step": 1074 }, { "completion_length": 124.8125, "epoch": 0.5751738897806313, "grad_norm": 0.7361770868301392, "kl": 0.15802960097789764, "learning_rate": 4.873079408535446e-06, "loss": 0.0063, "reward": 2.625, "reward_std": 0.5016143918037415, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 1075 }, { "completion_length": 141.15625, "epoch": 0.5757089352594971, "grad_norm": 1.7196800708770752, "kl": 0.2398001104593277, "learning_rate": 4.8725893159415365e-06, "loss": 0.0096, "reward": 2.20703125, "reward_std": 0.8902500867843628, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.48828125, "step": 1076 }, { "completion_length": 120.375, "epoch": 0.5762439807383628, "grad_norm": 3.841662645339966, "kl": 0.19016483426094055, "learning_rate": 4.872098303686543e-06, "loss": 0.0076, "reward": 2.4492499828338623, "reward_std": 1.0334084033966064, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4648750126361847, "step": 1077 }, { "completion_length": 157.875, "epoch": 0.5767790262172284, "grad_norm": 123558854656.0, "kl": 685536192.0, "learning_rate": 4.871606371960791e-06, "loss": 27421446.0, "reward": 1.4100937843322754, "reward_std": 1.0081040859222412, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.441343754529953, "step": 1078 }, { "completion_length": 139.625, "epoch": 0.5773140716960942, "grad_norm": 0.6823629140853882, "kl": 0.14674799144268036, "learning_rate": 4.871113520954963e-06, "loss": 0.0059, "reward": 2.0941874980926514, "reward_std": 1.2042444944381714, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42231249809265137, "step": 1079 }, { "completion_length": 110.8125, "epoch": 0.5778491171749599, "grad_norm": 3.2441391944885254, "kl": 0.33955657482147217, "learning_rate": 4.870619750860099e-06, "loss": 0.0136, "reward": 2.046875, "reward_std": 0.8386964797973633, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.484375, "step": 1080 }, { "completion_length": 136.25, "epoch": 0.5783841626538255, "grad_norm": 10.670232772827148, "kl": 0.3788706064224243, "learning_rate": 4.870125061867591e-06, "loss": 0.0152, "reward": 2.019718647003174, "reward_std": 0.8692156076431274, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.472843736410141, "step": 1081 }, { "completion_length": 134.3125, "epoch": 0.5789192081326913, "grad_norm": 0.9035869836807251, "kl": 0.161890909075737, "learning_rate": 4.869629454169191e-06, "loss": 0.0065, "reward": 2.130312442779541, "reward_std": 0.7093318104743958, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4428125023841858, "step": 1082 }, { "completion_length": 140.59375, "epoch": 0.579454253611557, "grad_norm": 1.3568679094314575, "kl": 0.16624248027801514, "learning_rate": 4.869132927957007e-06, "loss": 0.0066, "reward": 2.1321563720703125, "reward_std": 1.0389089584350586, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46028125286102295, "step": 1083 }, { "completion_length": 133.375, "epoch": 0.5799892990904227, "grad_norm": 2.994912624359131, "kl": 0.5019719004631042, "learning_rate": 4.868635483423501e-06, "loss": 0.0201, "reward": 2.1303439140319824, "reward_std": 1.234791874885559, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4584687352180481, "step": 1084 }, { "completion_length": 136.125, "epoch": 0.5805243445692884, "grad_norm": 1.9120302200317383, "kl": 0.13595765829086304, "learning_rate": 4.868137120761492e-06, "loss": 0.0054, "reward": 2.619906187057495, "reward_std": 1.0465166568756104, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4636562466621399, "step": 1085 }, { "completion_length": 164.0625, "epoch": 0.5810593900481541, "grad_norm": 1067991.0, "kl": 43704.609375, "learning_rate": 4.867637840164156e-06, "loss": 1748.1842, "reward": 1.480625033378601, "reward_std": 1.0858361721038818, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4025000035762787, "step": 1086 }, { "completion_length": 154.40625, "epoch": 0.5815944355270198, "grad_norm": 1.7502624988555908, "kl": 0.14038294553756714, "learning_rate": 4.867137641825022e-06, "loss": 0.0056, "reward": 1.97265625, "reward_std": 1.0183420181274414, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47265625, "step": 1087 }, { "completion_length": 130.46875, "epoch": 0.5821294810058855, "grad_norm": 2928018.5, "kl": 136767.375, "learning_rate": 4.866636525937978e-06, "loss": 5470.6958, "reward": 1.84375, "reward_std": 0.7538660764694214, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.484375, "step": 1088 }, { "completion_length": 138.34375, "epoch": 0.5826645264847512, "grad_norm": 190373.671875, "kl": 23158.970703125, "learning_rate": 4.8661344926972666e-06, "loss": 926.3588, "reward": 2.354062557220459, "reward_std": 0.8139108419418335, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4634374976158142, "step": 1089 }, { "completion_length": 123.375, "epoch": 0.5831995719636169, "grad_norm": 2.2500739097595215, "kl": 0.238655224442482, "learning_rate": 4.865631542297483e-06, "loss": 0.0095, "reward": 1.3759686946868896, "reward_std": 0.7137773036956787, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46971872448921204, "step": 1090 }, { "completion_length": 107.875, "epoch": 0.5837346174424826, "grad_norm": 1.1545377969741821, "kl": 0.20171883702278137, "learning_rate": 4.865127674933583e-06, "loss": 0.0081, "reward": 2.481562614440918, "reward_std": 1.0623788833618164, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4971874952316284, "step": 1091 }, { "completion_length": 138.0625, "epoch": 0.5842696629213483, "grad_norm": 5.837179660797119, "kl": 0.5907078981399536, "learning_rate": 4.864622890800874e-06, "loss": 0.0236, "reward": 2.60546875, "reward_std": 0.6084266901016235, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48046875, "step": 1092 }, { "completion_length": 158.4375, "epoch": 0.584804708400214, "grad_norm": 1.9407142400741577, "kl": 0.14126141369342804, "learning_rate": 4.86411719009502e-06, "loss": 0.0057, "reward": 1.74009370803833, "reward_std": 0.9432098269462585, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.39634376764297485, "step": 1093 }, { "completion_length": 146.65625, "epoch": 0.5853397538790798, "grad_norm": 0.7969549298286438, "kl": 0.15923042595386505, "learning_rate": 4.86361057301204e-06, "loss": 0.0064, "reward": 2.110875129699707, "reward_std": 1.0034128427505493, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4702500104904175, "step": 1094 }, { "completion_length": 133.34375, "epoch": 0.5858747993579454, "grad_norm": 328777728000.0, "kl": 2010634624.0, "learning_rate": 4.863103039748309e-06, "loss": 80425384.0, "reward": 2.0078125, "reward_std": 1.0450987815856934, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4765625, "step": 1095 }, { "completion_length": 134.5625, "epoch": 0.5864098448368111, "grad_norm": 1.9841277599334717, "kl": 0.18446558713912964, "learning_rate": 4.862594590500557e-06, "loss": 0.0074, "reward": 1.3502812385559082, "reward_std": 0.6823117733001709, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4127812385559082, "step": 1096 }, { "completion_length": 136.21875, "epoch": 0.5869448903156769, "grad_norm": 1.3361433744430542, "kl": 0.2493627965450287, "learning_rate": 4.862085225465869e-06, "loss": 0.01, "reward": 1.6888437271118164, "reward_std": 1.0379387140274048, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4544687569141388, "step": 1097 }, { "completion_length": 125.46875, "epoch": 0.5874799357945425, "grad_norm": 1.9568456411361694, "kl": 0.12660543620586395, "learning_rate": 4.861574944841683e-06, "loss": 0.0051, "reward": 2.225781202316284, "reward_std": 0.8077219724655151, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44453126192092896, "step": 1098 }, { "completion_length": 120.96875, "epoch": 0.5880149812734082, "grad_norm": 1.3766450881958008, "kl": 0.19546398520469666, "learning_rate": 4.861063748825794e-06, "loss": 0.0078, "reward": 2.257500171661377, "reward_std": 0.717449426651001, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4762499928474426, "step": 1099 }, { "completion_length": 158.1875, "epoch": 0.588550026752274, "grad_norm": 0.706422746181488, "kl": 0.10302649438381195, "learning_rate": 4.860551637616352e-06, "loss": 0.0041, "reward": 1.296625018119812, "reward_std": 0.9579567909240723, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3903749883174896, "step": 1100 }, { "completion_length": 148.6875, "epoch": 0.5890850722311396, "grad_norm": 3.8099961280822754, "kl": 0.1451481729745865, "learning_rate": 4.860038611411862e-06, "loss": 0.0058, "reward": 1.9230625629425049, "reward_std": 0.9699376821517944, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4230625033378601, "step": 1101 }, { "completion_length": 126.9375, "epoch": 0.5896201177100053, "grad_norm": 3.062288284301758, "kl": 0.1913975477218628, "learning_rate": 4.85952467041118e-06, "loss": 0.0077, "reward": 2.315281391143799, "reward_std": 1.0137503147125244, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4871562719345093, "step": 1102 }, { "completion_length": 146.65625, "epoch": 0.5901551631888711, "grad_norm": 0.906186580657959, "kl": 0.1697583645582199, "learning_rate": 4.859009814813522e-06, "loss": 0.0068, "reward": 2.266937494277954, "reward_std": 1.2305822372436523, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4700624942779541, "step": 1103 }, { "completion_length": 145.15625, "epoch": 0.5906902086677368, "grad_norm": 112.42255401611328, "kl": 20.76508903503418, "learning_rate": 4.858494044818455e-06, "loss": 0.8306, "reward": 1.2767187356948853, "reward_std": 0.6526585817337036, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41734373569488525, "step": 1104 }, { "completion_length": 114.1875, "epoch": 0.5912252541466024, "grad_norm": 1.4906765222549438, "kl": 0.2627100348472595, "learning_rate": 4.857977360625901e-06, "loss": 0.0105, "reward": 1.8855937719345093, "reward_std": 0.8832166790962219, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4793437421321869, "step": 1105 }, { "completion_length": 153.5625, "epoch": 0.5917602996254682, "grad_norm": 1.1831406354904175, "kl": 0.12053786218166351, "learning_rate": 4.857459762436137e-06, "loss": 0.0048, "reward": 2.0282187461853027, "reward_std": 0.8642251491546631, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45009374618530273, "step": 1106 }, { "completion_length": 125.125, "epoch": 0.5922953451043339, "grad_norm": 54.03129196166992, "kl": 2.0858314037323, "learning_rate": 4.856941250449795e-06, "loss": 0.0834, "reward": 2.2938437461853027, "reward_std": 1.0469787120819092, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48134374618530273, "step": 1107 }, { "completion_length": 133.28125, "epoch": 0.5928303905831995, "grad_norm": 515687808.0, "kl": 838117.375, "learning_rate": 4.856421824867858e-06, "loss": 33524.6992, "reward": 2.203125, "reward_std": 1.2589163780212402, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4375, "step": 1108 }, { "completion_length": 135.78125, "epoch": 0.5933654360620653, "grad_norm": 26790754.0, "kl": 3014705.0, "learning_rate": 4.855901485891668e-06, "loss": 120588.2188, "reward": 1.890625, "reward_std": 0.6575320363044739, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.453125, "step": 1109 }, { "completion_length": 145.5, "epoch": 0.593900481540931, "grad_norm": 10.507418632507324, "kl": 0.4508886933326721, "learning_rate": 4.855380233722915e-06, "loss": 0.018, "reward": 1.6450936794281006, "reward_std": 0.887839138507843, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.44196873903274536, "step": 1110 }, { "completion_length": 121.625, "epoch": 0.5944355270197966, "grad_norm": 1.2870452404022217, "kl": 0.24858596920967102, "learning_rate": 4.854858068563649e-06, "loss": 0.0099, "reward": 2.578125, "reward_std": 0.7215782403945923, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 1111 }, { "completion_length": 141.0625, "epoch": 0.5949705724986624, "grad_norm": 2.6830716133117676, "kl": 0.35975003242492676, "learning_rate": 4.854334990616271e-06, "loss": 0.0144, "reward": 1.9353437423706055, "reward_std": 0.9356533288955688, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48221877217292786, "step": 1112 }, { "completion_length": 139.0625, "epoch": 0.5955056179775281, "grad_norm": 0.7711618542671204, "kl": 0.14475545287132263, "learning_rate": 4.8538110000835345e-06, "loss": 0.0058, "reward": 2.3046875, "reward_std": 0.7572566866874695, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 1113 }, { "completion_length": 132.5625, "epoch": 0.5960406634563938, "grad_norm": 0.5737491846084595, "kl": 0.16078735888004303, "learning_rate": 4.853286097168549e-06, "loss": 0.0064, "reward": 2.1766250133514404, "reward_std": 0.8982473611831665, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.42662501335144043, "step": 1114 }, { "completion_length": 123.40625, "epoch": 0.5965757089352595, "grad_norm": 2.4518048763275146, "kl": 0.29048946499824524, "learning_rate": 4.852760282074778e-06, "loss": 0.0116, "reward": 2.2375311851501465, "reward_std": 1.0590269565582275, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.47190624475479126, "step": 1115 }, { "completion_length": 138.375, "epoch": 0.5971107544141252, "grad_norm": 1.2003726959228516, "kl": 0.13509199023246765, "learning_rate": 4.8522335550060366e-06, "loss": 0.0054, "reward": 2.460437297821045, "reward_std": 0.41581350564956665, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46043747663497925, "step": 1116 }, { "completion_length": 147.40625, "epoch": 0.5976457998929909, "grad_norm": 0.9796530604362488, "kl": 0.17400185763835907, "learning_rate": 4.851705916166494e-06, "loss": 0.007, "reward": 1.5952187776565552, "reward_std": 0.7163532972335815, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4233437478542328, "step": 1117 }, { "completion_length": 135.84375, "epoch": 0.5981808453718566, "grad_norm": 1.4077614545822144, "kl": 0.1544516384601593, "learning_rate": 4.851177365760673e-06, "loss": 0.0062, "reward": 1.6142187118530273, "reward_std": 0.9080516695976257, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.45796874165534973, "step": 1118 }, { "completion_length": 150.09375, "epoch": 0.5987158908507223, "grad_norm": 0.7276941537857056, "kl": 0.15462610125541687, "learning_rate": 4.850647903993451e-06, "loss": 0.0062, "reward": 1.9295625686645508, "reward_std": 0.6367148160934448, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 1119 }, { "completion_length": 140.46875, "epoch": 0.599250936329588, "grad_norm": 1.9607704877853394, "kl": 0.12936457991600037, "learning_rate": 4.850117531070057e-06, "loss": 0.0052, "reward": 2.171375036239624, "reward_std": 0.6747731566429138, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46824997663497925, "step": 1120 }, { "completion_length": 166.0, "epoch": 0.5997859818084538, "grad_norm": 0.9254263043403625, "kl": 0.1538991630077362, "learning_rate": 4.849586247196073e-06, "loss": 0.0062, "reward": 1.3611249923706055, "reward_std": 1.0023552179336548, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.34549999237060547, "step": 1121 }, { "completion_length": 117.0625, "epoch": 0.6003210272873194, "grad_norm": 1.5297528505325317, "kl": 0.1720333695411682, "learning_rate": 4.8490540525774356e-06, "loss": 0.0069, "reward": 1.62890625, "reward_std": 0.5869995951652527, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1122 }, { "completion_length": 145.75, "epoch": 0.6008560727661851, "grad_norm": 7.573436260223389, "kl": 0.18289348483085632, "learning_rate": 4.848520947420433e-06, "loss": 0.0073, "reward": 1.249843716621399, "reward_std": 0.539472222328186, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4217187464237213, "step": 1123 }, { "completion_length": 166.9375, "epoch": 0.6013911182450509, "grad_norm": 2.0861809253692627, "kl": 0.14075373113155365, "learning_rate": 4.8479869319317076e-06, "loss": 0.0056, "reward": 1.842437505722046, "reward_std": 1.213951587677002, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3893125057220459, "step": 1124 }, { "completion_length": 146.03125, "epoch": 0.6019261637239165, "grad_norm": 2.1365323066711426, "kl": 0.32203227281570435, "learning_rate": 4.847452006318254e-06, "loss": 0.0129, "reward": 1.5828750133514404, "reward_std": 0.7933632135391235, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44225001335144043, "step": 1125 }, { "completion_length": 146.96875, "epoch": 0.6024612092027822, "grad_norm": 180752400.0, "kl": 661927.875, "learning_rate": 4.846916170787419e-06, "loss": 26477.1152, "reward": 1.7147186994552612, "reward_std": 0.8099848031997681, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.417843759059906, "step": 1126 }, { "completion_length": 150.125, "epoch": 0.602996254681648, "grad_norm": 1.5868642330169678, "kl": 0.17907682061195374, "learning_rate": 4.846379425546904e-06, "loss": 0.0072, "reward": 1.92578125, "reward_std": 0.6235758066177368, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1127 }, { "completion_length": 173.25, "epoch": 0.6035313001605136, "grad_norm": 1.254693627357483, "kl": 0.13918109238147736, "learning_rate": 4.84584177080476e-06, "loss": 0.0056, "reward": 1.0245312452316284, "reward_std": 0.8269187808036804, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3682812452316284, "step": 1128 }, { "completion_length": 137.71875, "epoch": 0.6040663456393793, "grad_norm": 0.916369616985321, "kl": 0.13491305708885193, "learning_rate": 4.845303206769394e-06, "loss": 0.0054, "reward": 1.7646875381469727, "reward_std": 0.7088865041732788, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45218750834465027, "step": 1129 }, { "completion_length": 139.09375, "epoch": 0.6046013911182451, "grad_norm": 3.583649158477783, "kl": 0.2170032560825348, "learning_rate": 4.844763733649563e-06, "loss": 0.0087, "reward": 1.6181249618530273, "reward_std": 0.9838747978210449, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4462500214576721, "step": 1130 }, { "completion_length": 127.53125, "epoch": 0.6051364365971108, "grad_norm": 1.8583606481552124, "kl": 0.180629700422287, "learning_rate": 4.844223351654376e-06, "loss": 0.0072, "reward": 2.018843650817871, "reward_std": 0.6629297733306885, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48759374022483826, "step": 1131 }, { "completion_length": 138.59375, "epoch": 0.6056714820759764, "grad_norm": 8.995025634765625, "kl": 0.16382959485054016, "learning_rate": 4.8436820609932965e-06, "loss": 0.0066, "reward": 1.9490938186645508, "reward_std": 1.0377923250198364, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.464718759059906, "step": 1132 }, { "completion_length": 133.65625, "epoch": 0.6062065275548422, "grad_norm": 3108.1748046875, "kl": 4.077020168304443, "learning_rate": 4.84313986187614e-06, "loss": 0.1631, "reward": 1.8046875, "reward_std": 0.4207439422607422, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 1133 }, { "completion_length": 126.28125, "epoch": 0.6067415730337079, "grad_norm": 0.766920268535614, "kl": 0.17313995957374573, "learning_rate": 4.842596754513072e-06, "loss": 0.0069, "reward": 1.5945937633514404, "reward_std": 0.6447077989578247, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48521876335144043, "step": 1134 }, { "completion_length": 151.84375, "epoch": 0.6072766185125735, "grad_norm": 1.2857252359390259, "kl": 0.15743063390254974, "learning_rate": 4.842052739114612e-06, "loss": 0.0063, "reward": 1.3024063110351562, "reward_std": 0.8078908324241638, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.4117812514305115, "step": 1135 }, { "completion_length": 136.15625, "epoch": 0.6078116639914393, "grad_norm": 202.1629180908203, "kl": 0.4455593526363373, "learning_rate": 4.8415078158916295e-06, "loss": 0.0178, "reward": 1.639968752861023, "reward_std": 0.8352842330932617, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.43684375286102295, "step": 1136 }, { "completion_length": 159.75, "epoch": 0.608346709470305, "grad_norm": 0.7579218745231628, "kl": 0.1569453924894333, "learning_rate": 4.840961985055349e-06, "loss": 0.0063, "reward": 1.8006250858306885, "reward_std": 0.7113224864006042, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4412499964237213, "step": 1137 }, { "completion_length": 122.78125, "epoch": 0.6088817549491706, "grad_norm": 1.2823237180709839, "kl": 0.188606858253479, "learning_rate": 4.8404152468173435e-06, "loss": 0.0075, "reward": 2.0655312538146973, "reward_std": 0.575480043888092, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48740625381469727, "step": 1138 }, { "completion_length": 141.4375, "epoch": 0.6094168004280364, "grad_norm": 0.6547555923461914, "kl": 0.15681405365467072, "learning_rate": 4.839867601389541e-06, "loss": 0.0063, "reward": 2.0287187099456787, "reward_std": 0.6054452657699585, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4349687695503235, "step": 1139 }, { "completion_length": 126.53125, "epoch": 0.6099518459069021, "grad_norm": 0.9071377515792847, "kl": 0.21179160475730896, "learning_rate": 4.839319048984218e-06, "loss": 0.0085, "reward": 1.749843716621399, "reward_std": 0.7684418559074402, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4842187762260437, "step": 1140 }, { "completion_length": 118.65625, "epoch": 0.6104868913857678, "grad_norm": 1.7367770671844482, "kl": 0.20118066668510437, "learning_rate": 4.838769589814003e-06, "loss": 0.008, "reward": 2.53125, "reward_std": 1.0668096542358398, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 1141 }, { "completion_length": 121.1875, "epoch": 0.6110219368646335, "grad_norm": 1.689017653465271, "kl": 0.24749423563480377, "learning_rate": 4.8382192240918785e-06, "loss": 0.0099, "reward": 1.8294062614440918, "reward_std": 0.6611422300338745, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4856562614440918, "step": 1142 }, { "completion_length": 130.625, "epoch": 0.6115569823434992, "grad_norm": 1.7710204124450684, "kl": 0.21485115587711334, "learning_rate": 4.837667952031177e-06, "loss": 0.0086, "reward": 1.3502812385559082, "reward_std": 0.4595765173435211, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4596562385559082, "step": 1143 }, { "completion_length": 129.09375, "epoch": 0.6120920278223649, "grad_norm": 0.5750011205673218, "kl": 0.1932755708694458, "learning_rate": 4.837115773845581e-06, "loss": 0.0077, "reward": 2.639031410217285, "reward_std": 0.6640337705612183, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4671562612056732, "step": 1144 }, { "completion_length": 140.5, "epoch": 0.6126270733012306, "grad_norm": 0.4417639374732971, "kl": 0.14353258907794952, "learning_rate": 4.836562689749126e-06, "loss": 0.0057, "reward": 1.7255938053131104, "reward_std": 0.9569124579429626, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4599687457084656, "step": 1145 }, { "completion_length": 130.5, "epoch": 0.6131621187800963, "grad_norm": 0.9119896292686462, "kl": 0.16175028681755066, "learning_rate": 4.836008699956197e-06, "loss": 0.0065, "reward": 2.5915937423706055, "reward_std": 0.9438003897666931, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48221877217292786, "step": 1146 }, { "completion_length": 133.21875, "epoch": 0.613697164258962, "grad_norm": 0.6068432331085205, "kl": 0.19329026341438293, "learning_rate": 4.835453804681532e-06, "loss": 0.0077, "reward": 2.3273749351501465, "reward_std": 0.9994510412216187, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.49924999475479126, "step": 1147 }, { "completion_length": 145.53125, "epoch": 0.6142322097378277, "grad_norm": 1.117330551147461, "kl": 0.18292807042598724, "learning_rate": 4.8348980041402196e-06, "loss": 0.0073, "reward": 2.1047186851501465, "reward_std": 0.31390172243118286, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.40159374475479126, "step": 1148 }, { "completion_length": 127.90625, "epoch": 0.6147672552166934, "grad_norm": 1.6199867725372314, "kl": 0.201051726937294, "learning_rate": 4.8343412985476976e-06, "loss": 0.008, "reward": 1.7243125438690186, "reward_std": 0.5896052122116089, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45868751406669617, "step": 1149 }, { "completion_length": 126.25, "epoch": 0.6153023006955591, "grad_norm": 20381.900390625, "kl": 98.32941436767578, "learning_rate": 4.833783688119757e-06, "loss": 3.9332, "reward": 1.376312494277954, "reward_std": 0.6588824987411499, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4700624942779541, "step": 1150 }, { "completion_length": 126.125, "epoch": 0.6158373461744249, "grad_norm": 0.8533769249916077, "kl": 0.2183271050453186, "learning_rate": 4.833225173072537e-06, "loss": 0.0087, "reward": 1.6213124990463257, "reward_std": 0.4302343726158142, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4650624990463257, "step": 1151 }, { "completion_length": 158.96875, "epoch": 0.6163723916532905, "grad_norm": 15.22490406036377, "kl": 0.8183212280273438, "learning_rate": 4.8326657536225295e-06, "loss": 0.0327, "reward": 1.5835000276565552, "reward_std": 1.026379108428955, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3803749978542328, "step": 1152 }, { "completion_length": 132.53125, "epoch": 0.6169074371321562, "grad_norm": 2.2908101081848145, "kl": 0.16376148164272308, "learning_rate": 4.832105429986576e-06, "loss": 0.0066, "reward": 1.1878437995910645, "reward_std": 0.4213430881500244, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.45346879959106445, "step": 1153 }, { "completion_length": 138.65625, "epoch": 0.617442482611022, "grad_norm": 4.919558048248291, "kl": 0.5972936153411865, "learning_rate": 4.831544202381867e-06, "loss": 0.0239, "reward": 1.5345938205718994, "reward_std": 0.7987536191940308, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44084376096725464, "step": 1154 }, { "completion_length": 146.4375, "epoch": 0.6179775280898876, "grad_norm": 120.53196716308594, "kl": 9.114461898803711, "learning_rate": 4.830982071025948e-06, "loss": 0.3646, "reward": 2.04296875, "reward_std": 1.3140629529953003, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.44921875, "step": 1155 }, { "completion_length": 129.09375, "epoch": 0.6185125735687533, "grad_norm": 2.3227508068084717, "kl": 0.1891220062971115, "learning_rate": 4.830419036136711e-06, "loss": 0.0076, "reward": 2.558468818664551, "reward_std": 0.6928333640098572, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.495968759059906, "step": 1156 }, { "completion_length": 127.375, "epoch": 0.6190476190476191, "grad_norm": 1.5873818397521973, "kl": 0.1909523606300354, "learning_rate": 4.829855097932398e-06, "loss": 0.0076, "reward": 2.5511562824249268, "reward_std": 0.9500662088394165, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44178125262260437, "step": 1157 }, { "completion_length": 144.09375, "epoch": 0.6195826645264848, "grad_norm": 0.9471921324729919, "kl": 0.12336783111095428, "learning_rate": 4.829290256631603e-06, "loss": 0.0049, "reward": 1.2240312099456787, "reward_std": 0.7646522521972656, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3959062695503235, "step": 1158 }, { "completion_length": 130.46875, "epoch": 0.6201177100053504, "grad_norm": 2.440208673477173, "kl": 0.22307881712913513, "learning_rate": 4.8287245124532685e-06, "loss": 0.0089, "reward": 1.9790937900543213, "reward_std": 0.5552003979682922, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4790937602519989, "step": 1159 }, { "completion_length": 173.125, "epoch": 0.6206527554842162, "grad_norm": 2.692676544189453, "kl": 0.1461736559867859, "learning_rate": 4.82815786561669e-06, "loss": 0.0058, "reward": 1.1647812128067017, "reward_std": 0.7123895883560181, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.35228124260902405, "step": 1160 }, { "completion_length": 147.84375, "epoch": 0.6211878009630819, "grad_norm": 0.6114971041679382, "kl": 0.16168490052223206, "learning_rate": 4.827590316341509e-06, "loss": 0.0065, "reward": 1.8048124313354492, "reward_std": 0.5947219133377075, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.492312490940094, "step": 1161 }, { "completion_length": 142.5, "epoch": 0.6217228464419475, "grad_norm": 1.8644155263900757, "kl": 0.17539432644844055, "learning_rate": 4.827021864847718e-06, "loss": 0.007, "reward": 2.269218921661377, "reward_std": 0.802358865737915, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4567187428474426, "step": 1162 }, { "completion_length": 124.21875, "epoch": 0.6222578919208133, "grad_norm": 3.3469784259796143, "kl": 0.24128150939941406, "learning_rate": 4.8264525113556595e-06, "loss": 0.0097, "reward": 2.0734686851501465, "reward_std": 0.8929871320724487, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46409374475479126, "step": 1163 }, { "completion_length": 124.8125, "epoch": 0.622792937399679, "grad_norm": 2.025587797164917, "kl": 0.1711237132549286, "learning_rate": 4.825882256086028e-06, "loss": 0.0068, "reward": 1.7297186851501465, "reward_std": 0.6156458854675293, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49534374475479126, "step": 1164 }, { "completion_length": 150.375, "epoch": 0.6233279828785446, "grad_norm": 1.215242624282837, "kl": 0.16431397199630737, "learning_rate": 4.825311099259864e-06, "loss": 0.0066, "reward": 2.0175623893737793, "reward_std": 1.0027151107788086, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.39256250858306885, "step": 1165 }, { "completion_length": 129.625, "epoch": 0.6238630283574104, "grad_norm": 1.1859614849090576, "kl": 0.1723012924194336, "learning_rate": 4.8247390410985584e-06, "loss": 0.0069, "reward": 1.911250114440918, "reward_std": 0.7064253091812134, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4737499952316284, "step": 1166 }, { "completion_length": 151.21875, "epoch": 0.6243980738362761, "grad_norm": 2.775236129760742, "kl": 0.12084149569272995, "learning_rate": 4.824166081823853e-06, "loss": 0.0048, "reward": 1.9757812023162842, "reward_std": 0.9531334638595581, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42890626192092896, "step": 1167 }, { "completion_length": 119.9375, "epoch": 0.6249331193151418, "grad_norm": 2.4075491428375244, "kl": 0.18401175737380981, "learning_rate": 4.823592221657837e-06, "loss": 0.0074, "reward": 2.859375, "reward_std": 0.9539798498153687, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 1168 }, { "completion_length": 150.5625, "epoch": 0.6254681647940075, "grad_norm": 2.014112710952759, "kl": 0.15727856755256653, "learning_rate": 4.82301746082295e-06, "loss": 0.0063, "reward": 1.811593770980835, "reward_std": 0.8823353052139282, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4209687411785126, "step": 1169 }, { "completion_length": 127.0, "epoch": 0.6260032102728732, "grad_norm": 1.965189814567566, "kl": 0.17344443500041962, "learning_rate": 4.822441799541979e-06, "loss": 0.0069, "reward": 2.1707186698913574, "reward_std": 1.035498857498169, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4832187592983246, "step": 1170 }, { "completion_length": 148.90625, "epoch": 0.6265382557517389, "grad_norm": 0.7572735548019409, "kl": 0.1470494568347931, "learning_rate": 4.821865238038063e-06, "loss": 0.0059, "reward": 1.473562479019165, "reward_std": 0.6787317991256714, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4423125088214874, "step": 1171 }, { "completion_length": 179.0625, "epoch": 0.6270733012306046, "grad_norm": 41.420082092285156, "kl": 0.6404052972793579, "learning_rate": 4.8212877765346875e-06, "loss": 0.0256, "reward": 1.0611250400543213, "reward_std": 1.2107799053192139, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.3111249804496765, "step": 1172 }, { "completion_length": 160.0625, "epoch": 0.6276083467094703, "grad_norm": 1.6151316165924072, "kl": 0.12278712540864944, "learning_rate": 4.820709415255689e-06, "loss": 0.0049, "reward": 1.4396562576293945, "reward_std": 0.8303881883621216, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.39278125762939453, "step": 1173 }, { "completion_length": 140.84375, "epoch": 0.628143392188336, "grad_norm": 119569032.0, "kl": 151485.6875, "learning_rate": 4.820130154425249e-06, "loss": 6059.4292, "reward": 1.2657499313354492, "reward_std": 0.6728529334068298, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.406374990940094, "step": 1174 }, { "completion_length": 134.875, "epoch": 0.6286784376672017, "grad_norm": 0.7459377646446228, "kl": 0.15219177305698395, "learning_rate": 4.819549994267902e-06, "loss": 0.0061, "reward": 2.53125, "reward_std": 0.5253888964653015, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46875, "step": 1175 }, { "completion_length": 157.46875, "epoch": 0.6292134831460674, "grad_norm": 2.494032382965088, "kl": 0.29221582412719727, "learning_rate": 4.818968935008529e-06, "loss": 0.0117, "reward": 1.3904688358306885, "reward_std": 0.87603759765625, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4217187762260437, "step": 1176 }, { "completion_length": 133.53125, "epoch": 0.6297485286249331, "grad_norm": 4070.636474609375, "kl": 38.32822036743164, "learning_rate": 4.818386976872359e-06, "loss": 1.5331, "reward": 2.415781259536743, "reward_std": 0.7497556805610657, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47828125953674316, "step": 1177 }, { "completion_length": 159.90625, "epoch": 0.6302835741037989, "grad_norm": 6.320382118225098, "kl": 0.12401972711086273, "learning_rate": 4.8178041200849705e-06, "loss": 0.005, "reward": 1.7753437757492065, "reward_std": 1.0619139671325684, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41596874594688416, "step": 1178 }, { "completion_length": 141.46875, "epoch": 0.6308186195826645, "grad_norm": 0.9421367049217224, "kl": 0.1688237488269806, "learning_rate": 4.817220364872289e-06, "loss": 0.0068, "reward": 1.8984375, "reward_std": 0.76877760887146, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4609375, "step": 1179 }, { "completion_length": 136.25, "epoch": 0.6313536650615302, "grad_norm": 645781120.0, "kl": 21406512.0, "learning_rate": 4.816635711460591e-06, "loss": 856260.5, "reward": 2.2739062309265137, "reward_std": 1.0411639213562012, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43015623092651367, "step": 1180 }, { "completion_length": 123.3125, "epoch": 0.631888710540396, "grad_norm": 1551.2061767578125, "kl": 7.045471668243408, "learning_rate": 4.816050160076497e-06, "loss": 0.2818, "reward": 2.6846251487731934, "reward_std": 0.625637948513031, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4814999997615814, "step": 1181 }, { "completion_length": 145.0, "epoch": 0.6324237560192616, "grad_norm": 2.8764100074768066, "kl": 0.18298932909965515, "learning_rate": 4.815463710946982e-06, "loss": 0.0073, "reward": 1.3426562547683716, "reward_std": 0.6258293986320496, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3895312547683716, "step": 1182 }, { "completion_length": 147.28125, "epoch": 0.6329588014981273, "grad_norm": 0.9924288392066956, "kl": 0.16196587681770325, "learning_rate": 4.81487636429936e-06, "loss": 0.0065, "reward": 1.739687442779541, "reward_std": 0.7821372747421265, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4896875023841858, "step": 1183 }, { "completion_length": 160.5625, "epoch": 0.6334938469769931, "grad_norm": 7526326.5, "kl": 88503.921875, "learning_rate": 4.814288120361301e-06, "loss": 3540.1567, "reward": 1.1808124780654907, "reward_std": 0.63407963514328, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4151875078678131, "step": 1184 }, { "completion_length": 128.4375, "epoch": 0.6340288924558587, "grad_norm": 0.499479740858078, "kl": 0.17189571261405945, "learning_rate": 4.813698979360819e-06, "loss": 0.0069, "reward": 2.9234061241149902, "reward_std": 0.5625007152557373, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4859062433242798, "step": 1185 }, { "completion_length": 122.59375, "epoch": 0.6345639379347244, "grad_norm": 0.7635049223899841, "kl": 0.168272003531456, "learning_rate": 4.813108941526276e-06, "loss": 0.0067, "reward": 1.9660624265670776, "reward_std": 0.7090473175048828, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4660624861717224, "step": 1186 }, { "completion_length": 141.09375, "epoch": 0.6350989834135902, "grad_norm": 0.9496634006500244, "kl": 0.16982173919677734, "learning_rate": 4.812518007086381e-06, "loss": 0.0068, "reward": 2.1537814140319824, "reward_std": 1.0883796215057373, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4662812650203705, "step": 1187 }, { "completion_length": 130.03125, "epoch": 0.6356340288924559, "grad_norm": 144.0072784423828, "kl": 2.722853183746338, "learning_rate": 4.811926176270194e-06, "loss": 0.1089, "reward": 1.96875, "reward_std": 0.8769270181655884, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.453125, "step": 1188 }, { "completion_length": 146.71875, "epoch": 0.6361690743713215, "grad_norm": 1.2295737266540527, "kl": 0.1455611288547516, "learning_rate": 4.811333449307118e-06, "loss": 0.0058, "reward": 1.42578125, "reward_std": 0.49016010761260986, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 1189 }, { "completion_length": 151.8125, "epoch": 0.6367041198501873, "grad_norm": 0.748722493648529, "kl": 0.1257031112909317, "learning_rate": 4.810739826426905e-06, "loss": 0.005, "reward": 1.9574999809265137, "reward_std": 1.1216074228286743, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.44187498092651367, "step": 1190 }, { "completion_length": 152.28125, "epoch": 0.637239165329053, "grad_norm": 0.7136113047599792, "kl": 0.11753620952367783, "learning_rate": 4.810145307859656e-06, "loss": 0.0047, "reward": 2.2612500190734863, "reward_std": 1.3077993392944336, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43312501907348633, "step": 1191 }, { "completion_length": 112.78125, "epoch": 0.6377742108079186, "grad_norm": 0.7448825240135193, "kl": 0.22802947461605072, "learning_rate": 4.809549893835817e-06, "loss": 0.0091, "reward": 2.561687469482422, "reward_std": 0.9114301204681396, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49918749928474426, "step": 1192 }, { "completion_length": 141.65625, "epoch": 0.6383092562867844, "grad_norm": 1.4912089109420776, "kl": 0.23154151439666748, "learning_rate": 4.8089535845861834e-06, "loss": 0.0093, "reward": 1.7616875171661377, "reward_std": 0.8668724298477173, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4491875171661377, "step": 1193 }, { "completion_length": 130.59375, "epoch": 0.6388443017656501, "grad_norm": 1.104648470878601, "kl": 0.16060708463191986, "learning_rate": 4.808356380341894e-06, "loss": 0.0064, "reward": 2.18359375, "reward_std": 0.8132408261299133, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48046875, "step": 1194 }, { "completion_length": 146.4375, "epoch": 0.6393793472445158, "grad_norm": 2.2704970836639404, "kl": 0.23385748267173767, "learning_rate": 4.807758281334438e-06, "loss": 0.0094, "reward": 1.535312533378601, "reward_std": 0.5960267782211304, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4571875035762787, "step": 1195 }, { "completion_length": 142.84375, "epoch": 0.6399143927233815, "grad_norm": 0.9134326577186584, "kl": 0.12554913759231567, "learning_rate": 4.807159287795648e-06, "loss": 0.005, "reward": 1.785406231880188, "reward_std": 0.7933823466300964, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.457281231880188, "step": 1196 }, { "completion_length": 156.3125, "epoch": 0.6404494382022472, "grad_norm": 0.7649937272071838, "kl": 0.1538558006286621, "learning_rate": 4.8065593999577085e-06, "loss": 0.0062, "reward": 1.1220312118530273, "reward_std": 0.6731332540512085, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.38765624165534973, "step": 1197 }, { "completion_length": 115.40625, "epoch": 0.6409844836811129, "grad_norm": 1.3331812620162964, "kl": 0.2370794415473938, "learning_rate": 4.805958618053144e-06, "loss": 0.0095, "reward": 2.9150938987731934, "reward_std": 0.841652512550354, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4775937497615814, "step": 1198 }, { "completion_length": 155.1875, "epoch": 0.6415195291599786, "grad_norm": 0.9967899918556213, "kl": 0.15661779046058655, "learning_rate": 4.805356942314833e-06, "loss": 0.0063, "reward": 1.890625, "reward_std": 0.9182561635971069, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.453125, "step": 1199 }, { "completion_length": 133.0625, "epoch": 0.6420545746388443, "grad_norm": 1.2674229145050049, "kl": 0.2370433658361435, "learning_rate": 4.804754372975994e-06, "loss": 0.0095, "reward": 2.2066562175750732, "reward_std": 0.6854029893875122, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45665624737739563, "step": 1200 }, { "completion_length": 134.875, "epoch": 0.64258962011771, "grad_norm": 3.7003965377807617, "kl": 0.32111939787864685, "learning_rate": 4.804150910270195e-06, "loss": 0.0128, "reward": 2.141406297683716, "reward_std": 0.9018888473510742, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45390626788139343, "step": 1201 }, { "completion_length": 159.375, "epoch": 0.6431246655965757, "grad_norm": 1418.356201171875, "kl": 17.73797035217285, "learning_rate": 4.803546554431349e-06, "loss": 0.7095, "reward": 1.9634062051773071, "reward_std": 1.13246488571167, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4321562647819519, "step": 1202 }, { "completion_length": 139.0625, "epoch": 0.6436597110754414, "grad_norm": 5.707177639007568, "kl": 0.47430655360221863, "learning_rate": 4.802941305693716e-06, "loss": 0.019, "reward": 2.104875087738037, "reward_std": 0.977271556854248, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46424999833106995, "step": 1203 }, { "completion_length": 151.25, "epoch": 0.6441947565543071, "grad_norm": 1.33836030960083, "kl": 0.13024461269378662, "learning_rate": 4.8023351642919035e-06, "loss": 0.0052, "reward": 1.82421875, "reward_std": 0.7760835886001587, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.46484375, "step": 1204 }, { "completion_length": 147.78125, "epoch": 0.6447298020331729, "grad_norm": 57.74516296386719, "kl": 0.7118721604347229, "learning_rate": 4.8017281304608625e-06, "loss": 0.0285, "reward": 1.5786874294281006, "reward_std": 0.7476365566253662, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46931248903274536, "step": 1205 }, { "completion_length": 132.90625, "epoch": 0.6452648475120385, "grad_norm": 1.4421820640563965, "kl": 0.24648597836494446, "learning_rate": 4.8011202044358905e-06, "loss": 0.0099, "reward": 2.6088438034057617, "reward_std": 0.6271993517875671, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46821874380111694, "step": 1206 }, { "completion_length": 118.96875, "epoch": 0.6457998929909042, "grad_norm": 0.8652364611625671, "kl": 0.18512685596942902, "learning_rate": 4.800511386452632e-06, "loss": 0.0074, "reward": 2.479687452316284, "reward_std": 0.6848270893096924, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49531251192092896, "step": 1207 }, { "completion_length": 145.71875, "epoch": 0.64633493846977, "grad_norm": 1.461135983467102, "kl": 0.14789557456970215, "learning_rate": 4.799901676747076e-06, "loss": 0.0059, "reward": 1.546875, "reward_std": 0.8315116763114929, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.453125, "step": 1208 }, { "completion_length": 129.84375, "epoch": 0.6468699839486356, "grad_norm": 17.336654663085938, "kl": 1.2091501951217651, "learning_rate": 4.799291075555559e-06, "loss": 0.0484, "reward": 2.2373125553131104, "reward_std": 0.9644784927368164, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4716874957084656, "step": 1209 }, { "completion_length": 137.34375, "epoch": 0.6474050294275013, "grad_norm": 1.4788881540298462, "kl": 0.1878768652677536, "learning_rate": 4.79867958311476e-06, "loss": 0.0075, "reward": 2.1118125915527344, "reward_std": 1.0761387348175049, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.3930624723434448, "step": 1210 }, { "completion_length": 128.15625, "epoch": 0.6479400749063671, "grad_norm": 1.420641303062439, "kl": 0.24255457520484924, "learning_rate": 4.7980671996617055e-06, "loss": 0.0097, "reward": 2.3788437843322754, "reward_std": 1.0896981954574585, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.488218754529953, "step": 1211 }, { "completion_length": 139.625, "epoch": 0.6484751203852327, "grad_norm": 8.53813362121582, "kl": 0.17805486917495728, "learning_rate": 4.797453925433768e-06, "loss": 0.0071, "reward": 2.0874061584472656, "reward_std": 0.4725588858127594, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4624062478542328, "step": 1212 }, { "completion_length": 170.75, "epoch": 0.6490101658640984, "grad_norm": 6.79552698135376, "kl": 0.185313880443573, "learning_rate": 4.796839760668664e-06, "loss": 0.0074, "reward": 0.9356250762939453, "reward_std": 0.5729304552078247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.34187501668930054, "step": 1213 }, { "completion_length": 143.15625, "epoch": 0.6495452113429642, "grad_norm": 1.889217495918274, "kl": 0.20605501532554626, "learning_rate": 4.796224705604454e-06, "loss": 0.0082, "reward": 1.725250005722046, "reward_std": 0.9514451026916504, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4440000057220459, "step": 1214 }, { "completion_length": 105.875, "epoch": 0.6500802568218299, "grad_norm": 20630.7109375, "kl": 3845.8896484375, "learning_rate": 4.795608760479548e-06, "loss": 153.8356, "reward": 2.221437454223633, "reward_std": 0.4113546311855316, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4870625138282776, "step": 1215 }, { "completion_length": 132.0, "epoch": 0.6506153023006955, "grad_norm": 1.8097107410430908, "kl": 0.2700527310371399, "learning_rate": 4.794991925532696e-06, "loss": 0.0108, "reward": 2.2108750343322754, "reward_std": 0.9450124502182007, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.460875004529953, "step": 1216 }, { "completion_length": 109.6875, "epoch": 0.6511503477795613, "grad_norm": 1.2252389192581177, "kl": 0.4120093584060669, "learning_rate": 4.794374201002995e-06, "loss": 0.0165, "reward": 2.4015936851501465, "reward_std": 0.9123786091804504, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49534374475479126, "step": 1217 }, { "completion_length": 136.375, "epoch": 0.651685393258427, "grad_norm": 1.2586357593536377, "kl": 0.1607692539691925, "learning_rate": 4.793755587129889e-06, "loss": 0.0064, "reward": 2.56640625, "reward_std": 1.1033637523651123, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1218 }, { "completion_length": 151.96875, "epoch": 0.6522204387372926, "grad_norm": 0.4250779151916504, "kl": 0.15094450116157532, "learning_rate": 4.793136084153161e-06, "loss": 0.006, "reward": 1.6484375, "reward_std": 0.8794869184494019, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4765625, "step": 1219 }, { "completion_length": 129.84375, "epoch": 0.6527554842161584, "grad_norm": 1.0216773748397827, "kl": 0.23787979781627655, "learning_rate": 4.7925156923129465e-06, "loss": 0.0095, "reward": 1.6796875, "reward_std": 0.6324169039726257, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 1220 }, { "completion_length": 121.21875, "epoch": 0.6532905296950241, "grad_norm": 1.7553938627243042, "kl": 0.25412386655807495, "learning_rate": 4.791894411849718e-06, "loss": 0.0102, "reward": 2.6486563682556152, "reward_std": 0.5360500812530518, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4767812490463257, "step": 1221 }, { "completion_length": 136.5, "epoch": 0.6538255751738897, "grad_norm": 115.80778503417969, "kl": 2.967986822128296, "learning_rate": 4.7912722430042966e-06, "loss": 0.1187, "reward": 1.3584063053131104, "reward_std": 0.8145732283592224, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4052812457084656, "step": 1222 }, { "completion_length": 116.9375, "epoch": 0.6543606206527555, "grad_norm": 1.386365294456482, "kl": 0.24809084832668304, "learning_rate": 4.790649186017847e-06, "loss": 0.0099, "reward": 2.1089062690734863, "reward_std": 1.0412297248840332, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48390626907348633, "step": 1223 }, { "completion_length": 147.875, "epoch": 0.6548956661316212, "grad_norm": 8.828824996948242, "kl": 0.15759867429733276, "learning_rate": 4.790025241131878e-06, "loss": 0.0063, "reward": 1.5, "reward_std": 0.7017809748649597, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46875, "step": 1224 }, { "completion_length": 143.59375, "epoch": 0.6554307116104869, "grad_norm": 9.349348068237305, "kl": 0.19655528664588928, "learning_rate": 4.789400408588243e-06, "loss": 0.0079, "reward": 1.6850311756134033, "reward_std": 0.746795654296875, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3725312352180481, "step": 1225 }, { "completion_length": 171.28125, "epoch": 0.6559657570893526, "grad_norm": 0.6500729918479919, "kl": 0.1185685396194458, "learning_rate": 4.788774688629138e-06, "loss": 0.0047, "reward": 1.44921875, "reward_std": 0.9169615507125854, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.38671875, "step": 1226 }, { "completion_length": 134.5, "epoch": 0.6565008025682183, "grad_norm": 3.0429084300994873, "kl": 0.19739797711372375, "learning_rate": 4.788148081497107e-06, "loss": 0.0079, "reward": 1.7253124713897705, "reward_std": 0.8105577230453491, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4596875011920929, "step": 1227 }, { "completion_length": 153.03125, "epoch": 0.657035848047084, "grad_norm": 3.6865108013153076, "kl": 0.1770908534526825, "learning_rate": 4.787520587435031e-06, "loss": 0.0071, "reward": 1.8238437175750732, "reward_std": 1.0693519115447998, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.41759374737739563, "step": 1228 }, { "completion_length": 128.15625, "epoch": 0.6575708935259497, "grad_norm": 0.8804013133049011, "kl": 0.1825387179851532, "learning_rate": 4.7868922066861405e-06, "loss": 0.0073, "reward": 2.4297187328338623, "reward_std": 1.1285167932510376, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4922187328338623, "step": 1229 }, { "completion_length": 143.8125, "epoch": 0.6581059390048154, "grad_norm": 9.438017845153809, "kl": 0.1786160171031952, "learning_rate": 4.786262939494007e-06, "loss": 0.0071, "reward": 2.013218879699707, "reward_std": 0.806015133857727, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4507187306880951, "step": 1230 }, { "completion_length": 130.3125, "epoch": 0.6586409844836811, "grad_norm": 1.4901360273361206, "kl": 0.18162110447883606, "learning_rate": 4.785632786102549e-06, "loss": 0.0073, "reward": 2.0975937843322754, "reward_std": 0.8490539789199829, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.488218754529953, "step": 1231 }, { "completion_length": 139.25, "epoch": 0.6591760299625468, "grad_norm": 1.0007619857788086, "kl": 0.13919027149677277, "learning_rate": 4.785001746756024e-06, "loss": 0.0056, "reward": 1.6307499408721924, "reward_std": 0.8522399663925171, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44325000047683716, "step": 1232 }, { "completion_length": 139.59375, "epoch": 0.6597110754414125, "grad_norm": 2.3760950565338135, "kl": 0.20622287690639496, "learning_rate": 4.784369821699035e-06, "loss": 0.0082, "reward": 1.8602187633514404, "reward_std": 0.8080642223358154, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46959376335144043, "step": 1233 }, { "completion_length": 133.21875, "epoch": 0.6602461209202782, "grad_norm": 0.7770658135414124, "kl": 0.17015887796878815, "learning_rate": 4.7837370111765294e-06, "loss": 0.0068, "reward": 1.7734375, "reward_std": 0.6330854296684265, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4765625, "step": 1234 }, { "completion_length": 123.375, "epoch": 0.660781166399144, "grad_norm": 437796.6875, "kl": 970.88037109375, "learning_rate": 4.783103315433795e-06, "loss": 38.8352, "reward": 2.7687811851501465, "reward_std": 1.0934247970581055, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48753124475479126, "step": 1235 }, { "completion_length": 113.0625, "epoch": 0.6613162118780096, "grad_norm": 65.76395416259766, "kl": 0.4014405608177185, "learning_rate": 4.782468734716465e-06, "loss": 0.0161, "reward": 2.4155311584472656, "reward_std": 0.44550079107284546, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4936562478542328, "step": 1236 }, { "completion_length": 140.65625, "epoch": 0.6618512573568753, "grad_norm": 1.750115990638733, "kl": 0.1834104359149933, "learning_rate": 4.781833269270516e-06, "loss": 0.0073, "reward": 1.72390615940094, "reward_std": 0.8262066841125488, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4270312488079071, "step": 1237 }, { "completion_length": 149.3125, "epoch": 0.6623863028357411, "grad_norm": 1.2739464044570923, "kl": 0.19484084844589233, "learning_rate": 4.781196919342266e-06, "loss": 0.0078, "reward": 1.7620000839233398, "reward_std": 0.8445612788200378, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4651249945163727, "step": 1238 }, { "completion_length": 133.21875, "epoch": 0.6629213483146067, "grad_norm": 2.8710052967071533, "kl": 0.17705552279949188, "learning_rate": 4.7805596851783765e-06, "loss": 0.0071, "reward": 2.428687572479248, "reward_std": 1.151891827583313, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45993751287460327, "step": 1239 }, { "completion_length": 135.59375, "epoch": 0.6634563937934724, "grad_norm": 1.8552839756011963, "kl": 0.2486429214477539, "learning_rate": 4.779921567025851e-06, "loss": 0.0099, "reward": 1.2707188129425049, "reward_std": 0.5767713189125061, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4113437533378601, "step": 1240 }, { "completion_length": 133.0, "epoch": 0.6639914392723382, "grad_norm": 3.8878190517425537, "kl": 0.18448835611343384, "learning_rate": 4.7792825651320375e-06, "loss": 0.0074, "reward": 1.37109375, "reward_std": 0.6383590698242188, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46484375, "step": 1241 }, { "completion_length": 132.375, "epoch": 0.6645264847512039, "grad_norm": 3.790945529937744, "kl": 0.15838024020195007, "learning_rate": 4.778642679744624e-06, "loss": 0.0063, "reward": 2.0347187519073486, "reward_std": 0.936866283416748, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47221875190734863, "step": 1242 }, { "completion_length": 141.375, "epoch": 0.6650615302300695, "grad_norm": 2.1537623405456543, "kl": 0.14020612835884094, "learning_rate": 4.778001911111644e-06, "loss": 0.0056, "reward": 1.7431561946868896, "reward_std": 1.024482250213623, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4462812542915344, "step": 1243 }, { "completion_length": 132.375, "epoch": 0.6655965757089353, "grad_norm": 0.7581921219825745, "kl": 0.15193572640419006, "learning_rate": 4.77736025948147e-06, "loss": 0.0061, "reward": 1.98828125, "reward_std": 0.7618801593780518, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 1244 }, { "completion_length": 130.375, "epoch": 0.666131621187801, "grad_norm": 4.97514533996582, "kl": 0.15155094861984253, "learning_rate": 4.77671772510282e-06, "loss": 0.0061, "reward": 1.8463749885559082, "reward_std": 0.604564368724823, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4713749885559082, "step": 1245 }, { "completion_length": 153.875, "epoch": 0.6666666666666666, "grad_norm": 1.6581162214279175, "kl": 0.1605420857667923, "learning_rate": 4.77607430822475e-06, "loss": 0.0064, "reward": 2.0711562633514404, "reward_std": 0.9883583784103394, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46178126335144043, "step": 1246 }, { "completion_length": 134.40625, "epoch": 0.6672017121455324, "grad_norm": 0.7822808623313904, "kl": 0.1715381145477295, "learning_rate": 4.775430009096665e-06, "loss": 0.0069, "reward": 1.7082812786102295, "reward_std": 0.9537403583526611, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4270312488079071, "step": 1247 }, { "completion_length": 156.09375, "epoch": 0.6677367576243981, "grad_norm": 0.7341549396514893, "kl": 0.20433583855628967, "learning_rate": 4.774784827968304e-06, "loss": 0.0082, "reward": 1.573218822479248, "reward_std": 0.8186725378036499, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40134376287460327, "step": 1248 }, { "completion_length": 129.75, "epoch": 0.6682718031032637, "grad_norm": 8.49620532989502, "kl": 0.270000696182251, "learning_rate": 4.774138765089753e-06, "loss": 0.0108, "reward": 1.87890625, "reward_std": 0.8477107286453247, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1249 }, { "completion_length": 126.625, "epoch": 0.6688068485821295, "grad_norm": 1.9736534357070923, "kl": 0.288747638463974, "learning_rate": 4.773491820711439e-06, "loss": 0.0115, "reward": 1.62890625, "reward_std": 0.8517700433731079, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1250 }, { "completion_length": 128.375, "epoch": 0.6693418940609952, "grad_norm": 1.7849953174591064, "kl": 0.2256479263305664, "learning_rate": 4.772843995084128e-06, "loss": 0.009, "reward": 2.0810937881469727, "reward_std": 0.9125122427940369, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45609375834465027, "step": 1251 }, { "completion_length": 146.9375, "epoch": 0.6698769395398609, "grad_norm": 0.8078083992004395, "kl": 0.21349366009235382, "learning_rate": 4.7721952884589314e-06, "loss": 0.0085, "reward": 1.3404375314712524, "reward_std": 0.7242166996002197, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43418750166893005, "step": 1252 }, { "completion_length": 122.125, "epoch": 0.6704119850187266, "grad_norm": 2.2759575843811035, "kl": 0.16682681441307068, "learning_rate": 4.7715457010873e-06, "loss": 0.0067, "reward": 2.577812671661377, "reward_std": 0.6149824857711792, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4996874928474426, "step": 1253 }, { "completion_length": 100.4375, "epoch": 0.6709470304975923, "grad_norm": 4.716650485992432, "kl": 0.5369620323181152, "learning_rate": 4.770895233221026e-06, "loss": 0.0215, "reward": 2.4045000076293945, "reward_std": 0.8340545296669006, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.49825000762939453, "step": 1254 }, { "completion_length": 122.6875, "epoch": 0.671482075976458, "grad_norm": 0.991934061050415, "kl": 0.20523706078529358, "learning_rate": 4.770243885112243e-06, "loss": 0.0082, "reward": 2.36328125, "reward_std": 0.7939968109130859, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1255 }, { "completion_length": 120.71875, "epoch": 0.6720171214553237, "grad_norm": 3.7842161655426025, "kl": 0.18940886855125427, "learning_rate": 4.769591657013427e-06, "loss": 0.0076, "reward": 2.218843698501587, "reward_std": 0.94798743724823, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4688437581062317, "step": 1256 }, { "completion_length": 135.53125, "epoch": 0.6725521669341894, "grad_norm": 1.7154995203018188, "kl": 0.16756147146224976, "learning_rate": 4.7689385491773934e-06, "loss": 0.0067, "reward": 2.4440624713897705, "reward_std": 0.7388110160827637, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4596875011920929, "step": 1257 }, { "completion_length": 130.25, "epoch": 0.6730872124130551, "grad_norm": 178825.390625, "kl": 11586.884765625, "learning_rate": 4.768284561857299e-06, "loss": 463.4754, "reward": 2.091343641281128, "reward_std": 1.1941665410995483, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4507187604904175, "step": 1258 }, { "completion_length": 150.46875, "epoch": 0.6736222578919208, "grad_norm": 1.702048420906067, "kl": 0.22347639501094818, "learning_rate": 4.767629695306642e-06, "loss": 0.0089, "reward": 1.4104686975479126, "reward_std": 0.6220147013664246, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4573437571525574, "step": 1259 }, { "completion_length": 128.03125, "epoch": 0.6741573033707865, "grad_norm": 0.9223964810371399, "kl": 0.15268173813819885, "learning_rate": 4.766973949779261e-06, "loss": 0.0061, "reward": 2.375, "reward_std": 0.8974630832672119, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46875, "step": 1260 }, { "completion_length": 129.40625, "epoch": 0.6746923488496522, "grad_norm": 1.914715051651001, "kl": 0.1543698012828827, "learning_rate": 4.766317325529337e-06, "loss": 0.0062, "reward": 1.8783438205718994, "reward_std": 0.7756062746047974, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44084376096725464, "step": 1261 }, { "completion_length": 160.90625, "epoch": 0.675227394328518, "grad_norm": 2380.500244140625, "kl": 303.3290710449219, "learning_rate": 4.765659822811388e-06, "loss": 12.1332, "reward": 1.7402812242507935, "reward_std": 1.0411005020141602, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41215625405311584, "step": 1262 }, { "completion_length": 140.375, "epoch": 0.6757624398073836, "grad_norm": 0.874347448348999, "kl": 0.1492132991552353, "learning_rate": 4.765001441880276e-06, "loss": 0.006, "reward": 1.8242499828338623, "reward_std": 0.4129515290260315, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4492499828338623, "step": 1263 }, { "completion_length": 121.71875, "epoch": 0.6762974852862493, "grad_norm": 0.9083854556083679, "kl": 0.21329209208488464, "learning_rate": 4.764342182991203e-06, "loss": 0.0085, "reward": 2.3149685859680176, "reward_std": 0.8288516998291016, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4712187647819519, "step": 1264 }, { "completion_length": 124.96875, "epoch": 0.6768325307651151, "grad_norm": 37.502769470214844, "kl": 0.7173722982406616, "learning_rate": 4.763682046399708e-06, "loss": 0.0287, "reward": 2.53125, "reward_std": 0.8996846079826355, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 1265 }, { "completion_length": 141.46875, "epoch": 0.6773675762439807, "grad_norm": 0.6347408294677734, "kl": 0.18469586968421936, "learning_rate": 4.763021032361674e-06, "loss": 0.0074, "reward": 2.0795936584472656, "reward_std": 0.9353722333908081, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4077187478542328, "step": 1266 }, { "completion_length": 132.90625, "epoch": 0.6779026217228464, "grad_norm": 0.6380177736282349, "kl": 0.1623004525899887, "learning_rate": 4.762359141133322e-06, "loss": 0.0065, "reward": 2.3046875, "reward_std": 0.6539685726165771, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 1267 }, { "completion_length": 133.09375, "epoch": 0.6784376672017122, "grad_norm": 3.029404640197754, "kl": 0.18530422449111938, "learning_rate": 4.761696372971214e-06, "loss": 0.0075, "reward": 2.4897189140319824, "reward_std": 0.5351591110229492, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4272187352180481, "step": 1268 }, { "completion_length": 144.8125, "epoch": 0.6789727126805778, "grad_norm": 3.094346046447754, "kl": 0.18356800079345703, "learning_rate": 4.761032728132253e-06, "loss": 0.0073, "reward": 1.353874921798706, "reward_std": 0.6562768220901489, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43199998140335083, "step": 1269 }, { "completion_length": 126.0625, "epoch": 0.6795077581594435, "grad_norm": 0.7573635578155518, "kl": 0.19417022168636322, "learning_rate": 4.760368206873679e-06, "loss": 0.0078, "reward": 2.7771873474121094, "reward_std": 0.5114620327949524, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4646874964237213, "step": 1270 }, { "completion_length": 144.34375, "epoch": 0.6800428036383093, "grad_norm": 1.2013953924179077, "kl": 0.18700087070465088, "learning_rate": 4.759702809453073e-06, "loss": 0.0075, "reward": 1.8595938682556152, "reward_std": 0.7247844338417053, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4533437490463257, "step": 1271 }, { "completion_length": 151.09375, "epoch": 0.680577849117175, "grad_norm": 0.9315585494041443, "kl": 0.19387568533420563, "learning_rate": 4.759036536128356e-06, "loss": 0.0078, "reward": 1.716437578201294, "reward_std": 0.8420247435569763, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.43518751859664917, "step": 1272 }, { "completion_length": 142.40625, "epoch": 0.6811128945960406, "grad_norm": 1.0408626794815063, "kl": 0.144597128033638, "learning_rate": 4.758369387157789e-06, "loss": 0.0058, "reward": 1.89453125, "reward_std": 0.7379032969474792, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1273 }, { "completion_length": 127.9375, "epoch": 0.6816479400749064, "grad_norm": 1.0082776546478271, "kl": 0.18365536630153656, "learning_rate": 4.757701362799972e-06, "loss": 0.0073, "reward": 1.9030624628067017, "reward_std": 1.0716605186462402, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43431249260902405, "step": 1274 }, { "completion_length": 139.09375, "epoch": 0.6821829855537721, "grad_norm": 1.7575260400772095, "kl": 0.14538279175758362, "learning_rate": 4.757032463313842e-06, "loss": 0.0058, "reward": 1.3303437232971191, "reward_std": 0.6646837592124939, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.47096875309944153, "step": 1275 }, { "completion_length": 130.71875, "epoch": 0.6827180310326377, "grad_norm": 1.6088534593582153, "kl": 0.206363245844841, "learning_rate": 4.756362688958679e-06, "loss": 0.0083, "reward": 1.7292499542236328, "reward_std": 0.8537540435791016, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4480000138282776, "step": 1276 }, { "completion_length": 136.90625, "epoch": 0.6832530765115035, "grad_norm": 2.398103952407837, "kl": 0.1489579826593399, "learning_rate": 4.7556920399940995e-06, "loss": 0.006, "reward": 2.113781213760376, "reward_std": 1.1549999713897705, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44190624356269836, "step": 1277 }, { "completion_length": 120.53125, "epoch": 0.6837881219903692, "grad_norm": 0.8736487030982971, "kl": 0.167108952999115, "learning_rate": 4.755020516680061e-06, "loss": 0.0067, "reward": 1.808781385421753, "reward_std": 0.4521641731262207, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4806562662124634, "step": 1278 }, { "completion_length": 151.15625, "epoch": 0.6843231674692349, "grad_norm": 1.3872891664505005, "kl": 0.16717393696308136, "learning_rate": 4.754348119276858e-06, "loss": 0.0067, "reward": 1.76953125, "reward_std": 0.466144323348999, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44140625, "step": 1279 }, { "completion_length": 135.71875, "epoch": 0.6848582129481006, "grad_norm": 1.8419647216796875, "kl": 0.17536889016628265, "learning_rate": 4.753674848045126e-06, "loss": 0.007, "reward": 1.4329062700271606, "reward_std": 0.374480277299881, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46415624022483826, "step": 1280 }, { "completion_length": 151.90625, "epoch": 0.6853932584269663, "grad_norm": 1337.5413818359375, "kl": 122.83477020263672, "learning_rate": 4.753000703245835e-06, "loss": 4.9134, "reward": 1.5361249446868896, "reward_std": 0.9982843399047852, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4267500042915344, "step": 1281 }, { "completion_length": 121.71875, "epoch": 0.685928303905832, "grad_norm": 0.7466825246810913, "kl": 0.164460688829422, "learning_rate": 4.752325685140298e-06, "loss": 0.0066, "reward": 2.4453125, "reward_std": 0.9801732897758484, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 1282 }, { "completion_length": 129.09375, "epoch": 0.6864633493846977, "grad_norm": 0.9428567886352539, "kl": 0.2269175499677658, "learning_rate": 4.7516497939901655e-06, "loss": 0.0091, "reward": 2.2104687690734863, "reward_std": 0.9593601822853088, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47609376907348633, "step": 1283 }, { "completion_length": 141.34375, "epoch": 0.6869983948635634, "grad_norm": 2.271610736846924, "kl": 0.3767840564250946, "learning_rate": 4.750973030057425e-06, "loss": 0.0151, "reward": 2.160468816757202, "reward_std": 1.1095123291015625, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4729687571525574, "step": 1284 }, { "completion_length": 141.21875, "epoch": 0.6875334403424291, "grad_norm": 2.850010871887207, "kl": 0.1839030683040619, "learning_rate": 4.7502953936044035e-06, "loss": 0.0074, "reward": 2.0849688053131104, "reward_std": 1.131287693977356, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4287187457084656, "step": 1285 }, { "completion_length": 150.84375, "epoch": 0.6880684858212948, "grad_norm": 1.4872922897338867, "kl": 0.1907801330089569, "learning_rate": 4.749616884893767e-06, "loss": 0.0076, "reward": 1.307031273841858, "reward_std": 0.6604703664779663, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.4164062738418579, "step": 1286 }, { "completion_length": 143.0625, "epoch": 0.6886035313001605, "grad_norm": 0.8385428190231323, "kl": 0.15425245463848114, "learning_rate": 4.7489375041885166e-06, "loss": 0.0062, "reward": 1.980562448501587, "reward_std": 0.969718337059021, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4180625081062317, "step": 1287 }, { "completion_length": 149.1875, "epoch": 0.6891385767790262, "grad_norm": 0.7882683277130127, "kl": 0.13382241129875183, "learning_rate": 4.748257251751994e-06, "loss": 0.0054, "reward": 1.5379999876022339, "reward_std": 0.7616504430770874, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4130000174045563, "step": 1288 }, { "completion_length": 118.9375, "epoch": 0.689673622257892, "grad_norm": 4175396.75, "kl": 17620.9296875, "learning_rate": 4.747576127847878e-06, "loss": 704.8372, "reward": 2.1519062519073486, "reward_std": 0.825492262840271, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48003125190734863, "step": 1289 }, { "completion_length": 123.0, "epoch": 0.6902086677367576, "grad_norm": 2.182110071182251, "kl": 0.18718384206295013, "learning_rate": 4.746894132740185e-06, "loss": 0.0075, "reward": 2.0409374237060547, "reward_std": 1.0525320768356323, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47843748331069946, "step": 1290 }, { "completion_length": 144.71875, "epoch": 0.6907437132156233, "grad_norm": 3.437016487121582, "kl": 0.29224321246147156, "learning_rate": 4.7462112666932715e-06, "loss": 0.0117, "reward": 1.4081249237060547, "reward_std": 0.6428443789482117, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42374998331069946, "step": 1291 }, { "completion_length": 130.0, "epoch": 0.6912787586944891, "grad_norm": 1.0030187368392944, "kl": 0.12398919463157654, "learning_rate": 4.745527529971826e-06, "loss": 0.005, "reward": 2.4165937900543213, "reward_std": 1.1162359714508057, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4634687304496765, "step": 1292 }, { "completion_length": 113.375, "epoch": 0.6918138041733547, "grad_norm": 1.7940826416015625, "kl": 0.2641543745994568, "learning_rate": 4.744842922840881e-06, "loss": 0.0106, "reward": 1.818718671798706, "reward_std": 0.6761373281478882, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4905937612056732, "step": 1293 }, { "completion_length": 125.5, "epoch": 0.6923488496522204, "grad_norm": 1.2547358274459839, "kl": 0.16837486624717712, "learning_rate": 4.744157445565801e-06, "loss": 0.0067, "reward": 2.6757187843322754, "reward_std": 0.5963508486747742, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.472593754529953, "step": 1294 }, { "completion_length": 140.25, "epoch": 0.6928838951310862, "grad_norm": 2.25240421295166, "kl": 0.2426927238702774, "learning_rate": 4.743471098412292e-06, "loss": 0.0097, "reward": 1.9514374732971191, "reward_std": 0.9592258930206299, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46706250309944153, "step": 1295 }, { "completion_length": 135.90625, "epoch": 0.6934189406099518, "grad_norm": 8.777785301208496, "kl": 0.22325043380260468, "learning_rate": 4.742783881646394e-06, "loss": 0.0089, "reward": 2.0920000076293945, "reward_std": 1.1802877187728882, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46700000762939453, "step": 1296 }, { "completion_length": 137.25, "epoch": 0.6939539860888175, "grad_norm": 4377912320.0, "kl": 29854790.0, "learning_rate": 4.742095795534486e-06, "loss": 1194191.625, "reward": 1.54296875, "reward_std": 0.6731728315353394, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41796875, "step": 1297 }, { "completion_length": 114.78125, "epoch": 0.6944890315676833, "grad_norm": 1.4366084337234497, "kl": 0.1572728157043457, "learning_rate": 4.741406840343283e-06, "loss": 0.0063, "reward": 2.46875, "reward_std": 0.7880350351333618, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 1298 }, { "completion_length": 145.0625, "epoch": 0.695024077046549, "grad_norm": 1865.6702880859375, "kl": 242.40615844726562, "learning_rate": 4.740717016339837e-06, "loss": 9.6962, "reward": 2.01590633392334, "reward_std": 1.1054743528366089, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4690312445163727, "step": 1299 }, { "completion_length": 158.65625, "epoch": 0.6955591225254146, "grad_norm": 2976.78759765625, "kl": 82.48163604736328, "learning_rate": 4.740026323791539e-06, "loss": 3.2993, "reward": 1.92759370803833, "reward_std": 0.8269764184951782, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45884376764297485, "step": 1300 }, { "completion_length": 158.6875, "epoch": 0.6960941680042804, "grad_norm": 1.1530253887176514, "kl": 0.18029840290546417, "learning_rate": 4.7393347629661116e-06, "loss": 0.0072, "reward": 1.3776562213897705, "reward_std": 0.6689630150794983, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3932812511920929, "step": 1301 }, { "completion_length": 129.15625, "epoch": 0.6966292134831461, "grad_norm": 3.0829005241394043, "kl": 0.2078336477279663, "learning_rate": 4.73864233413162e-06, "loss": 0.0083, "reward": 1.2104687690734863, "reward_std": 0.5949475765228271, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46046873927116394, "step": 1302 }, { "completion_length": 133.625, "epoch": 0.6971642589620117, "grad_norm": 4.463025093078613, "kl": 0.23342996835708618, "learning_rate": 4.73794903755646e-06, "loss": 0.0093, "reward": 2.0202813148498535, "reward_std": 0.8135800361633301, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48903125524520874, "step": 1303 }, { "completion_length": 128.96875, "epoch": 0.6976993044408775, "grad_norm": 12925363200.0, "kl": 10604119.0, "learning_rate": 4.73725487350937e-06, "loss": 424164.7812, "reward": 1.9375, "reward_std": 0.694037914276123, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 1304 }, { "completion_length": 128.78125, "epoch": 0.6982343499197432, "grad_norm": 1.4223800897598267, "kl": 0.17248988151550293, "learning_rate": 4.736559842259417e-06, "loss": 0.0069, "reward": 2.1561875343322754, "reward_std": 0.9032754898071289, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.484312504529953, "step": 1305 }, { "completion_length": 112.375, "epoch": 0.6987693953986088, "grad_norm": 0.8622397184371948, "kl": 0.18255993723869324, "learning_rate": 4.735863944076012e-06, "loss": 0.0073, "reward": 2.141031265258789, "reward_std": 0.6279072761535645, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4847812354564667, "step": 1306 }, { "completion_length": 133.0, "epoch": 0.6993044408774746, "grad_norm": 2065028352.0, "kl": 19116792.0, "learning_rate": 4.735167179228898e-06, "loss": 764671.625, "reward": 2.2172813415527344, "reward_std": 1.015663504600525, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4829062521457672, "step": 1307 }, { "completion_length": 138.96875, "epoch": 0.6998394863563403, "grad_norm": 3.285486936569214, "kl": 0.23158323764801025, "learning_rate": 4.734469547988152e-06, "loss": 0.0093, "reward": 1.3828125, "reward_std": 0.3275107145309448, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4921875, "step": 1308 }, { "completion_length": 137.28125, "epoch": 0.700374531835206, "grad_norm": 0.6134077310562134, "kl": 0.14342710375785828, "learning_rate": 4.733771050624192e-06, "loss": 0.0057, "reward": 2.05859375, "reward_std": 0.737153172492981, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46484375, "step": 1309 }, { "completion_length": 135.96875, "epoch": 0.7009095773140717, "grad_norm": 1.5114985704421997, "kl": 0.21078543365001678, "learning_rate": 4.7330716874077675e-06, "loss": 0.0084, "reward": 2.68359375, "reward_std": 1.0742943286895752, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49609375, "step": 1310 }, { "completion_length": 123.34375, "epoch": 0.7014446227929374, "grad_norm": 2.163844585418701, "kl": 0.1955692172050476, "learning_rate": 4.7323714586099664e-06, "loss": 0.0078, "reward": 1.8813750743865967, "reward_std": 0.9015467166900635, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4751250147819519, "step": 1311 }, { "completion_length": 121.59375, "epoch": 0.7019796682718031, "grad_norm": 13.404565811157227, "kl": 0.5167250633239746, "learning_rate": 4.731670364502209e-06, "loss": 0.0207, "reward": 1.8936874866485596, "reward_std": 0.8635177612304688, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47181248664855957, "step": 1312 }, { "completion_length": 121.375, "epoch": 0.7025147137506688, "grad_norm": 1.0102686882019043, "kl": 0.18925414979457855, "learning_rate": 4.730968405356254e-06, "loss": 0.0076, "reward": 2.1108124256134033, "reward_std": 0.5137800574302673, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4701874852180481, "step": 1313 }, { "completion_length": 125.625, "epoch": 0.7030497592295345, "grad_norm": 10.778743743896484, "kl": 0.5265287160873413, "learning_rate": 4.730265581444193e-06, "loss": 0.0211, "reward": 2.55859375, "reward_std": 0.9907503724098206, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49609375, "step": 1314 }, { "completion_length": 132.96875, "epoch": 0.7035848047084002, "grad_norm": 0.920703649520874, "kl": 0.1872105896472931, "learning_rate": 4.7295618930384554e-06, "loss": 0.0075, "reward": 1.7867813110351562, "reward_std": 0.7707017660140991, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4586562514305115, "step": 1315 }, { "completion_length": 109.875, "epoch": 0.704119850187266, "grad_norm": 2.291936159133911, "kl": 0.21146464347839355, "learning_rate": 4.728857340411805e-06, "loss": 0.0085, "reward": 2.421875, "reward_std": 1.1263251304626465, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 1316 }, { "completion_length": 124.25, "epoch": 0.7046548956661316, "grad_norm": 0.831163227558136, "kl": 0.2078067809343338, "learning_rate": 4.728151923837337e-06, "loss": 0.0083, "reward": 2.64453125, "reward_std": 0.9141651391983032, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 1317 }, { "completion_length": 126.875, "epoch": 0.7051899411449973, "grad_norm": 1.3862687349319458, "kl": 0.2239042967557907, "learning_rate": 4.7274456435884875e-06, "loss": 0.009, "reward": 2.403843879699707, "reward_std": 0.7910365462303162, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4507187306880951, "step": 1318 }, { "completion_length": 128.46875, "epoch": 0.7057249866238631, "grad_norm": 3.875558614730835, "kl": 0.1571407914161682, "learning_rate": 4.726738499939022e-06, "loss": 0.0063, "reward": 2.036437511444092, "reward_std": 0.8958250880241394, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4739375114440918, "step": 1319 }, { "completion_length": 120.03125, "epoch": 0.7062600321027287, "grad_norm": 0.9517608284950256, "kl": 0.20373332500457764, "learning_rate": 4.726030493163044e-06, "loss": 0.0081, "reward": 2.274843692779541, "reward_std": 0.718026340007782, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4779687523841858, "step": 1320 }, { "completion_length": 122.71875, "epoch": 0.7067950775815944, "grad_norm": 1.5733386278152466, "kl": 0.19045737385749817, "learning_rate": 4.725321623534991e-06, "loss": 0.0076, "reward": 2.25, "reward_std": 0.7400826215744019, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 1321 }, { "completion_length": 143.125, "epoch": 0.7073301230604602, "grad_norm": 4.5912251472473145, "kl": 0.2263915240764618, "learning_rate": 4.724611891329633e-06, "loss": 0.0091, "reward": 1.2927813529968262, "reward_std": 0.3218250274658203, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.464656263589859, "step": 1322 }, { "completion_length": 148.1875, "epoch": 0.7078651685393258, "grad_norm": 1.4132548570632935, "kl": 0.1694340705871582, "learning_rate": 4.723901296822076e-06, "loss": 0.0068, "reward": 1.6106562614440918, "reward_std": 0.4504537582397461, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4700312614440918, "step": 1323 }, { "completion_length": 100.625, "epoch": 0.7084002140181915, "grad_norm": 0.9050006866455078, "kl": 0.220991313457489, "learning_rate": 4.723189840287762e-06, "loss": 0.0088, "reward": 1.9029061794281006, "reward_std": 0.8125878572463989, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48103123903274536, "step": 1324 }, { "completion_length": 113.46875, "epoch": 0.7089352594970573, "grad_norm": 2.0538883209228516, "kl": 0.27245426177978516, "learning_rate": 4.722477522002463e-06, "loss": 0.0109, "reward": 1.56640625, "reward_std": 0.702547550201416, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1325 }, { "completion_length": 159.5625, "epoch": 0.709470304975923, "grad_norm": 1.464745283126831, "kl": 0.12753552198410034, "learning_rate": 4.721764342242288e-06, "loss": 0.0051, "reward": 1.6747187376022339, "reward_std": 0.8792353868484497, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4090937376022339, "step": 1326 }, { "completion_length": 160.46875, "epoch": 0.7100053504547886, "grad_norm": 0.8685373067855835, "kl": 0.16474100947380066, "learning_rate": 4.7210503012836786e-06, "loss": 0.0066, "reward": 1.5273125171661377, "reward_std": 0.6918765902519226, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4491875171661377, "step": 1327 }, { "completion_length": 141.9375, "epoch": 0.7105403959336544, "grad_norm": 1.5680643320083618, "kl": 0.19196996092796326, "learning_rate": 4.720335399403413e-06, "loss": 0.0077, "reward": 1.5694999694824219, "reward_std": 0.7508977055549622, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42887499928474426, "step": 1328 }, { "completion_length": 141.84375, "epoch": 0.7110754414125201, "grad_norm": 3.9197275638580322, "kl": 0.26304665207862854, "learning_rate": 4.719619636878597e-06, "loss": 0.0105, "reward": 1.6775312423706055, "reward_std": 0.8337301015853882, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44315624237060547, "step": 1329 }, { "completion_length": 127.15625, "epoch": 0.7116104868913857, "grad_norm": 2.6652965545654297, "kl": 0.193008154630661, "learning_rate": 4.718903013986678e-06, "loss": 0.0077, "reward": 2.140625, "reward_std": 0.9434360265731812, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.5, "step": 1330 }, { "completion_length": 151.4375, "epoch": 0.7121455323702515, "grad_norm": 382.25286865234375, "kl": 42.793235778808594, "learning_rate": 4.71818553100543e-06, "loss": 1.7117, "reward": 1.4520937204360962, "reward_std": 0.8861548900604248, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3895937502384186, "step": 1331 }, { "completion_length": 126.96875, "epoch": 0.7126805778491172, "grad_norm": 2.235673189163208, "kl": 0.19064460694789886, "learning_rate": 4.717467188212963e-06, "loss": 0.0076, "reward": 1.8115313053131104, "reward_std": 0.7256323099136353, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4834062457084656, "step": 1332 }, { "completion_length": 145.96875, "epoch": 0.7132156233279828, "grad_norm": 1.4019235372543335, "kl": 0.17031921446323395, "learning_rate": 4.716747985887722e-06, "loss": 0.0068, "reward": 1.449125051498413, "reward_std": 0.8071049451828003, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4334999918937683, "step": 1333 }, { "completion_length": 114.375, "epoch": 0.7137506688068486, "grad_norm": 1.1770586967468262, "kl": 0.1800549328327179, "learning_rate": 4.716027924308483e-06, "loss": 0.0072, "reward": 2.4375, "reward_std": 0.7705072164535522, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 1334 }, { "completion_length": 153.03125, "epoch": 0.7142857142857143, "grad_norm": 5.704557418823242, "kl": 0.14275217056274414, "learning_rate": 4.715307003754356e-06, "loss": 0.0057, "reward": 2.262406349182129, "reward_std": 0.7929638624191284, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46553125977516174, "step": 1335 }, { "completion_length": 134.09375, "epoch": 0.71482075976458, "grad_norm": 2.2906436920166016, "kl": 0.19375388324260712, "learning_rate": 4.714585224504783e-06, "loss": 0.0078, "reward": 1.82421875, "reward_std": 0.7679173946380615, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46484375, "step": 1336 }, { "completion_length": 131.96875, "epoch": 0.7153558052434457, "grad_norm": 1.0257432460784912, "kl": 0.1687902808189392, "learning_rate": 4.71386258683954e-06, "loss": 0.0068, "reward": 1.7803125381469727, "reward_std": 0.7298128604888916, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.40531250834465027, "step": 1337 }, { "completion_length": 140.4375, "epoch": 0.7158908507223114, "grad_norm": 6.382419109344482, "kl": 0.3051040470600128, "learning_rate": 4.713139091038735e-06, "loss": 0.0122, "reward": 1.8963749408721924, "reward_std": 0.7897579073905945, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44325000047683716, "step": 1338 }, { "completion_length": 125.09375, "epoch": 0.7164258962011771, "grad_norm": 1.9343609809875488, "kl": 0.32812628149986267, "learning_rate": 4.7124147373828085e-06, "loss": 0.0131, "reward": 1.7570624351501465, "reward_std": 0.9907123446464539, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46018749475479126, "step": 1339 }, { "completion_length": 131.34375, "epoch": 0.7169609416800428, "grad_norm": 0.8411646485328674, "kl": 0.14162173867225647, "learning_rate": 4.711689526152534e-06, "loss": 0.0057, "reward": 2.3277812004089355, "reward_std": 0.5464677810668945, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4684062600135803, "step": 1340 }, { "completion_length": 152.75, "epoch": 0.7174959871589085, "grad_norm": 0.9288503527641296, "kl": 0.13082203269004822, "learning_rate": 4.710963457629018e-06, "loss": 0.0052, "reward": 1.5866875648498535, "reward_std": 1.103079080581665, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.41481250524520874, "step": 1341 }, { "completion_length": 128.78125, "epoch": 0.7180310326377742, "grad_norm": 0.8000268936157227, "kl": 0.2038874477148056, "learning_rate": 4.710236532093697e-06, "loss": 0.0082, "reward": 1.9112812280654907, "reward_std": 0.874223530292511, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4737812578678131, "step": 1342 }, { "completion_length": 126.375, "epoch": 0.7185660781166399, "grad_norm": 0.8772702813148499, "kl": 0.18635523319244385, "learning_rate": 4.709508749828343e-06, "loss": 0.0075, "reward": 2.2015938758850098, "reward_std": 1.1201448440551758, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4672187566757202, "step": 1343 }, { "completion_length": 137.3125, "epoch": 0.7191011235955056, "grad_norm": 9975778.0, "kl": 1409295.5, "learning_rate": 4.708780111115058e-06, "loss": 56371.8242, "reward": 2.308468818664551, "reward_std": 0.8394098281860352, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.464718759059906, "step": 1344 }, { "completion_length": 118.46875, "epoch": 0.7196361690743713, "grad_norm": 1.2860960960388184, "kl": 0.16906490921974182, "learning_rate": 4.708050616236275e-06, "loss": 0.0068, "reward": 2.06640625, "reward_std": 0.567266583442688, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 1345 }, { "completion_length": 126.6875, "epoch": 0.7201712145532371, "grad_norm": 1.1195766925811768, "kl": 0.17055411636829376, "learning_rate": 4.707320265474762e-06, "loss": 0.0068, "reward": 1.8695625066757202, "reward_std": 0.8883811831474304, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4789375066757202, "step": 1346 }, { "completion_length": 133.9375, "epoch": 0.7207062600321027, "grad_norm": 0.6438809037208557, "kl": 0.1672963798046112, "learning_rate": 4.706589059113615e-06, "loss": 0.0067, "reward": 2.4531874656677246, "reward_std": 0.6293826103210449, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.468812495470047, "step": 1347 }, { "completion_length": 109.96875, "epoch": 0.7212413055109684, "grad_norm": 0.6036624908447266, "kl": 0.1486901044845581, "learning_rate": 4.705856997436266e-06, "loss": 0.0059, "reward": 2.4040937423706055, "reward_std": 0.696326494216919, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48221874237060547, "step": 1348 }, { "completion_length": 129.78125, "epoch": 0.7217763509898342, "grad_norm": 374628576.0, "kl": 6876485.5, "learning_rate": 4.705124080726473e-06, "loss": 275059.4375, "reward": 2.71875, "reward_std": 0.8805594444274902, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.484375, "step": 1349 }, { "completion_length": 131.84375, "epoch": 0.7223113964686998, "grad_norm": 4.271143436431885, "kl": 0.18149563670158386, "learning_rate": 4.704390309268332e-06, "loss": 0.0073, "reward": 2.1482186317443848, "reward_std": 0.9303598403930664, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4763437509536743, "step": 1350 }, { "completion_length": 135.40625, "epoch": 0.7228464419475655, "grad_norm": 1.9272651672363281, "kl": 0.22856232523918152, "learning_rate": 4.703655683346264e-06, "loss": 0.0091, "reward": 2.809593677520752, "reward_std": 0.8516982793807983, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48146873712539673, "step": 1351 }, { "completion_length": 148.3125, "epoch": 0.7233814874264313, "grad_norm": 3205.32568359375, "kl": 5.226241588592529, "learning_rate": 4.702920203245026e-06, "loss": 0.209, "reward": 2.441281318664551, "reward_std": 0.7010500431060791, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.441281259059906, "step": 1352 }, { "completion_length": 114.625, "epoch": 0.723916532905297, "grad_norm": 4.47627592086792, "kl": 0.5174469947814941, "learning_rate": 4.702183869249705e-06, "loss": 0.0207, "reward": 2.2531561851501465, "reward_std": 1.141735553741455, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47190624475479126, "step": 1353 }, { "completion_length": 146.09375, "epoch": 0.7244515783841626, "grad_norm": 179.1688995361328, "kl": 2.2140231132507324, "learning_rate": 4.701446681645714e-06, "loss": 0.0886, "reward": 2.1793437004089355, "reward_std": 0.7415843605995178, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3980937600135803, "step": 1354 }, { "completion_length": 133.09375, "epoch": 0.7249866238630284, "grad_norm": 1.878449559211731, "kl": 0.3128138780593872, "learning_rate": 4.7007086407188054e-06, "loss": 0.0125, "reward": 1.78125, "reward_std": 0.7914295196533203, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.484375, "step": 1355 }, { "completion_length": 131.96875, "epoch": 0.7255216693418941, "grad_norm": 1.1624250411987305, "kl": 0.27090349793434143, "learning_rate": 4.699969746755056e-06, "loss": 0.0108, "reward": 2.372218608856201, "reward_std": 0.7246734499931335, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4659687578678131, "step": 1356 }, { "completion_length": 152.40625, "epoch": 0.7260567148207597, "grad_norm": 1.6341878175735474, "kl": 0.2070818394422531, "learning_rate": 4.6992300000408755e-06, "loss": 0.0083, "reward": 1.828125, "reward_std": 0.8389663696289062, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.484375, "step": 1357 }, { "completion_length": 135.34375, "epoch": 0.7265917602996255, "grad_norm": 1.466042399406433, "kl": 0.20918051898479462, "learning_rate": 4.698489400863004e-06, "loss": 0.0084, "reward": 1.9170000553131104, "reward_std": 0.8457155227661133, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4638749957084656, "step": 1358 }, { "completion_length": 134.0, "epoch": 0.7271268057784912, "grad_norm": 1.2057594060897827, "kl": 0.1598740518093109, "learning_rate": 4.697747949508512e-06, "loss": 0.0064, "reward": 2.2109375, "reward_std": 0.6693140268325806, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4453125, "step": 1359 }, { "completion_length": 134.625, "epoch": 0.7276618512573568, "grad_norm": 1.5499650239944458, "kl": 0.19691705703735352, "learning_rate": 4.6970056462648e-06, "loss": 0.0079, "reward": 2.2578125, "reward_std": 0.3998047113418579, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 1360 }, { "completion_length": 124.03125, "epoch": 0.7281968967362226, "grad_norm": 2.038050651550293, "kl": 0.3633975088596344, "learning_rate": 4.696262491419599e-06, "loss": 0.0145, "reward": 1.402843713760376, "reward_std": 0.5884992480278015, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48096874356269836, "step": 1361 }, { "completion_length": 143.5, "epoch": 0.7287319422150883, "grad_norm": 1.6446948051452637, "kl": 0.18593093752861023, "learning_rate": 4.69551848526097e-06, "loss": 0.0074, "reward": 2.018125057220459, "reward_std": 1.1549994945526123, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4399999976158142, "step": 1362 }, { "completion_length": 143.9375, "epoch": 0.729266987693954, "grad_norm": 539.825927734375, "kl": 50.05784225463867, "learning_rate": 4.6947736280773044e-06, "loss": 2.0023, "reward": 1.95703125, "reward_std": 0.738165557384491, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44140625, "step": 1363 }, { "completion_length": 135.5, "epoch": 0.7298020331728197, "grad_norm": 1.4726667404174805, "kl": 0.20174545049667358, "learning_rate": 4.694027920157324e-06, "loss": 0.0081, "reward": 1.8668749332427979, "reward_std": 0.9745560884475708, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4606249928474426, "step": 1364 }, { "completion_length": 139.5, "epoch": 0.7303370786516854, "grad_norm": 1.1223182678222656, "kl": 0.14913903176784515, "learning_rate": 4.693281361790078e-06, "loss": 0.006, "reward": 2.234375, "reward_std": 0.9234033823013306, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.453125, "step": 1365 }, { "completion_length": 140.40625, "epoch": 0.7308721241305511, "grad_norm": 257954.109375, "kl": 27610.1640625, "learning_rate": 4.692533953264946e-06, "loss": 1104.4065, "reward": 1.40625, "reward_std": 0.4780496656894684, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.453125, "step": 1366 }, { "completion_length": 156.59375, "epoch": 0.7314071696094168, "grad_norm": 1.4806997776031494, "kl": 0.12765654921531677, "learning_rate": 4.69178569487164e-06, "loss": 0.0051, "reward": 1.3133437633514404, "reward_std": 0.9007707834243774, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.40709376335144043, "step": 1367 }, { "completion_length": 125.34375, "epoch": 0.7319422150882825, "grad_norm": 0.7823727130889893, "kl": 0.215663343667984, "learning_rate": 4.691036586900199e-06, "loss": 0.0086, "reward": 1.8742499351501465, "reward_std": 0.9008055329322815, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48362499475479126, "step": 1368 }, { "completion_length": 122.0625, "epoch": 0.7324772605671482, "grad_norm": 0.8130596280097961, "kl": 0.17679819464683533, "learning_rate": 4.690286629640989e-06, "loss": 0.0071, "reward": 1.56640625, "reward_std": 0.6262367963790894, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47265625, "step": 1369 }, { "completion_length": 106.53125, "epoch": 0.7330123060460139, "grad_norm": 1.0411256551742554, "kl": 0.19864752888679504, "learning_rate": 4.689535823384711e-06, "loss": 0.0079, "reward": 2.703125, "reward_std": 0.9245278835296631, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 1370 }, { "completion_length": 124.90625, "epoch": 0.7335473515248796, "grad_norm": 0.5545085072517395, "kl": 0.17283323407173157, "learning_rate": 4.688784168422391e-06, "loss": 0.0069, "reward": 2.156125068664551, "reward_std": 0.5919922590255737, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 1371 }, { "completion_length": 146.0, "epoch": 0.7340823970037453, "grad_norm": 0.4930022954940796, "kl": 0.14492903649806976, "learning_rate": 4.688031665045383e-06, "loss": 0.0058, "reward": 1.66796875, "reward_std": 0.413887083530426, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46484375, "step": 1372 }, { "completion_length": 160.65625, "epoch": 0.7346174424826111, "grad_norm": 1.0702646970748901, "kl": 0.22213737666606903, "learning_rate": 4.6872783135453744e-06, "loss": 0.0089, "reward": 1.413812518119812, "reward_std": 0.8138837814331055, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.398187518119812, "step": 1373 }, { "completion_length": 128.09375, "epoch": 0.7351524879614767, "grad_norm": 0.887993574142456, "kl": 0.2229943573474884, "learning_rate": 4.686524114214377e-06, "loss": 0.0089, "reward": 1.9831249713897705, "reward_std": 0.4591113328933716, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4831250011920929, "step": 1374 }, { "completion_length": 123.90625, "epoch": 0.7356875334403424, "grad_norm": 0.7636374831199646, "kl": 0.18098369240760803, "learning_rate": 4.685769067344735e-06, "loss": 0.0072, "reward": 2.3966562747955322, "reward_std": 0.9871000051498413, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49040624499320984, "step": 1375 }, { "completion_length": 139.4375, "epoch": 0.7362225789192082, "grad_norm": 1.5138267278671265, "kl": 0.16669829189777374, "learning_rate": 4.685013173229115e-06, "loss": 0.0067, "reward": 1.513124942779541, "reward_std": 0.8712751269340515, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4350000023841858, "step": 1376 }, { "completion_length": 158.9375, "epoch": 0.7367576243980738, "grad_norm": 889.21044921875, "kl": 194.2343292236328, "learning_rate": 4.68425643216052e-06, "loss": 7.7694, "reward": 1.429593801498413, "reward_std": 0.5926955938339233, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3670937418937683, "step": 1377 }, { "completion_length": 120.5625, "epoch": 0.7372926698769395, "grad_norm": 226881.109375, "kl": 797.3062133789062, "learning_rate": 4.683498844432275e-06, "loss": 31.8922, "reward": 2.4974687099456787, "reward_std": 0.6661157011985779, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4818437397480011, "step": 1378 }, { "completion_length": 139.5625, "epoch": 0.7378277153558053, "grad_norm": 8.871941566467285, "kl": 1.2885386943817139, "learning_rate": 4.682740410338037e-06, "loss": 0.0515, "reward": 1.9802813529968262, "reward_std": 0.7778295278549194, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.449031263589859, "step": 1379 }, { "completion_length": 123.9375, "epoch": 0.7383627608346709, "grad_norm": 1.6754792928695679, "kl": 0.2162347137928009, "learning_rate": 4.681981130171789e-06, "loss": 0.0086, "reward": 2.3316874504089355, "reward_std": 0.9790078401565552, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4723125100135803, "step": 1380 }, { "completion_length": 128.46875, "epoch": 0.7388978063135366, "grad_norm": 1.4767285585403442, "kl": 0.20364467799663544, "learning_rate": 4.681221004227842e-06, "loss": 0.0081, "reward": 2.2925937175750732, "reward_std": 0.8804377317428589, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46446874737739563, "step": 1381 }, { "completion_length": 111.09375, "epoch": 0.7394328517924024, "grad_norm": 111897336.0, "kl": 251971.390625, "learning_rate": 4.680460032800837e-06, "loss": 10078.8555, "reward": 2.34375, "reward_std": 0.45023012161254883, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 1382 }, { "completion_length": 113.875, "epoch": 0.7399678972712681, "grad_norm": 1.0818202495574951, "kl": 0.20806919038295746, "learning_rate": 4.679698216185739e-06, "loss": 0.0083, "reward": 2.766906261444092, "reward_std": 1.1449379920959473, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4700312614440918, "step": 1383 }, { "completion_length": 153.03125, "epoch": 0.7405029427501337, "grad_norm": 1.3733043670654297, "kl": 0.15274131298065186, "learning_rate": 4.678935554677843e-06, "loss": 0.0061, "reward": 1.4293124675750732, "reward_std": 0.550453782081604, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46056249737739563, "step": 1384 }, { "completion_length": 153.21875, "epoch": 0.7410379882289995, "grad_norm": 1.3853915929794312, "kl": 0.12682349979877472, "learning_rate": 4.678172048572773e-06, "loss": 0.0051, "reward": 1.921781301498413, "reward_std": 0.9640448689460754, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4686562418937683, "step": 1385 }, { "completion_length": 107.8125, "epoch": 0.7415730337078652, "grad_norm": 1.181353211402893, "kl": 0.16847330331802368, "learning_rate": 4.677407698166477e-06, "loss": 0.0067, "reward": 2.78125, "reward_std": 0.43338578939437866, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 1386 }, { "completion_length": 111.625, "epoch": 0.7421080791867308, "grad_norm": 0.769601583480835, "kl": 0.19293837249279022, "learning_rate": 4.676642503755232e-06, "loss": 0.0077, "reward": 2.570187568664551, "reward_std": 0.5308808088302612, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.492062509059906, "step": 1387 }, { "completion_length": 130.40625, "epoch": 0.7426431246655966, "grad_norm": 2.5194835662841797, "kl": 0.2592165470123291, "learning_rate": 4.675876465635644e-06, "loss": 0.0104, "reward": 1.8339061737060547, "reward_std": 0.6589592099189758, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.49015623331069946, "step": 1388 }, { "completion_length": 127.625, "epoch": 0.7431781701444623, "grad_norm": 0.655585527420044, "kl": 0.1512501984834671, "learning_rate": 4.67510958410464e-06, "loss": 0.006, "reward": 1.7578125, "reward_std": 0.9110718965530396, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4765625, "step": 1389 }, { "completion_length": 141.625, "epoch": 0.7437132156233279, "grad_norm": 1.968310832977295, "kl": 0.19168543815612793, "learning_rate": 4.674341859459482e-06, "loss": 0.0077, "reward": 1.5258437395095825, "reward_std": 0.9945640563964844, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4008437395095825, "step": 1390 }, { "completion_length": 136.4375, "epoch": 0.7442482611021937, "grad_norm": 1.1011320352554321, "kl": 0.1896960735321045, "learning_rate": 4.673573291997754e-06, "loss": 0.0076, "reward": 1.58984375, "reward_std": 0.49412858486175537, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44921875, "step": 1391 }, { "completion_length": 151.46875, "epoch": 0.7447833065810594, "grad_norm": 1.7050931453704834, "kl": 0.2215871959924698, "learning_rate": 4.672803882017365e-06, "loss": 0.0089, "reward": 1.1968750953674316, "reward_std": 0.6933515071868896, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.40000003576278687, "step": 1392 }, { "completion_length": 125.78125, "epoch": 0.7453183520599251, "grad_norm": 1.9415444135665894, "kl": 0.26692748069763184, "learning_rate": 4.672033629816556e-06, "loss": 0.0107, "reward": 2.1639063358306885, "reward_std": 0.593234658241272, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4607812464237213, "step": 1393 }, { "completion_length": 119.96875, "epoch": 0.7458533975387908, "grad_norm": 2.698237180709839, "kl": 0.2763079106807709, "learning_rate": 4.671262535693889e-06, "loss": 0.0111, "reward": 1.7656562328338623, "reward_std": 0.5185074210166931, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4687812626361847, "step": 1394 }, { "completion_length": 128.5, "epoch": 0.7463884430176565, "grad_norm": 0.5130787491798401, "kl": 0.167728453874588, "learning_rate": 4.6704905999482575e-06, "loss": 0.0067, "reward": 2.301374912261963, "reward_std": 0.7617305517196655, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48887500166893005, "step": 1395 }, { "completion_length": 121.4375, "epoch": 0.7469234884965222, "grad_norm": 1.9001911878585815, "kl": 0.20121459662914276, "learning_rate": 4.6697178228788755e-06, "loss": 0.008, "reward": 2.379406213760376, "reward_std": 0.8674001693725586, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.45753124356269836, "step": 1396 }, { "completion_length": 115.90625, "epoch": 0.7474585339753879, "grad_norm": 4.261927127838135, "kl": 0.34377190470695496, "learning_rate": 4.6689442047852895e-06, "loss": 0.0138, "reward": 2.6683125495910645, "reward_std": 0.6166018843650818, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4964374899864197, "step": 1397 }, { "completion_length": 146.25, "epoch": 0.7479935794542536, "grad_norm": 16703892.0, "kl": 66755.7421875, "learning_rate": 4.668169745967366e-06, "loss": 2670.2302, "reward": 2.0047812461853027, "reward_std": 0.957224428653717, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4579062759876251, "step": 1398 }, { "completion_length": 133.4375, "epoch": 0.7485286249331193, "grad_norm": 1.9990843534469604, "kl": 0.18113014101982117, "learning_rate": 4.6673944467253016e-06, "loss": 0.0072, "reward": 1.9765625, "reward_std": 0.7666981816291809, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4609375, "step": 1399 }, { "completion_length": 134.90625, "epoch": 0.7490636704119851, "grad_norm": 2.0025124549865723, "kl": 0.1925472915172577, "learning_rate": 4.666618307359617e-06, "loss": 0.0077, "reward": 1.6015625, "reward_std": 0.7949979305267334, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 1400 }, { "completion_length": 140.15625, "epoch": 0.7495987158908507, "grad_norm": 0.993449330329895, "kl": 0.20975366234779358, "learning_rate": 4.665841328171158e-06, "loss": 0.0084, "reward": 2.0295000076293945, "reward_std": 0.8373004198074341, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45137497782707214, "step": 1401 }, { "completion_length": 136.375, "epoch": 0.7501337613697164, "grad_norm": 1.0207829475402832, "kl": 0.18531861901283264, "learning_rate": 4.665063509461098e-06, "loss": 0.0074, "reward": 2.2552499771118164, "reward_std": 1.2268359661102295, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4740000069141388, "step": 1402 }, { "completion_length": 160.125, "epoch": 0.7506688068485822, "grad_norm": 0.8918894529342651, "kl": 0.1774914562702179, "learning_rate": 4.664284851530931e-06, "loss": 0.0071, "reward": 1.7940937280654907, "reward_std": 0.9976012110710144, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4347187578678131, "step": 1403 }, { "completion_length": 126.1875, "epoch": 0.7512038523274478, "grad_norm": 1.0009480714797974, "kl": 0.2047116458415985, "learning_rate": 4.663505354682484e-06, "loss": 0.0082, "reward": 1.5792813301086426, "reward_std": 0.6666337251663208, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4855312705039978, "step": 1404 }, { "completion_length": 141.1875, "epoch": 0.7517388978063135, "grad_norm": 1.1322554349899292, "kl": 0.16709944605827332, "learning_rate": 4.662725019217903e-06, "loss": 0.0067, "reward": 1.8116874694824219, "reward_std": 0.8526776432991028, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48356249928474426, "step": 1405 }, { "completion_length": 120.4375, "epoch": 0.7522739432851793, "grad_norm": 0.9022395610809326, "kl": 0.18206000328063965, "learning_rate": 4.661943845439662e-06, "loss": 0.0073, "reward": 1.9421250820159912, "reward_std": 0.4110163450241089, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47337502241134644, "step": 1406 }, { "completion_length": 132.59375, "epoch": 0.7528089887640449, "grad_norm": 1.6387420892715454, "kl": 0.2759650945663452, "learning_rate": 4.661161833650556e-06, "loss": 0.011, "reward": 2.34765625, "reward_std": 0.6520997285842896, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1407 }, { "completion_length": 142.9375, "epoch": 0.7533440342429106, "grad_norm": 2.6342551708221436, "kl": 0.1688046008348465, "learning_rate": 4.6603789841537105e-06, "loss": 0.0068, "reward": 1.6948437690734863, "reward_std": 0.5203487277030945, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47609376907348633, "step": 1408 }, { "completion_length": 122.59375, "epoch": 0.7538790797217764, "grad_norm": 1.1034634113311768, "kl": 0.16988009214401245, "learning_rate": 4.659595297252573e-06, "loss": 0.0068, "reward": 2.122499942779541, "reward_std": 0.9045699834823608, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4662500023841858, "step": 1409 }, { "completion_length": 125.28125, "epoch": 0.7544141252006421, "grad_norm": 690201.0625, "kl": 131735.328125, "learning_rate": 4.658810773250913e-06, "loss": 5269.4146, "reward": 2.1015625, "reward_std": 0.352231502532959, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4609375, "step": 1410 }, { "completion_length": 150.03125, "epoch": 0.7549491706795077, "grad_norm": 160.95106506347656, "kl": 7.054842472076416, "learning_rate": 4.658025412452831e-06, "loss": 0.2822, "reward": 2.2145938873291016, "reward_std": 1.1686933040618896, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.480218768119812, "step": 1411 }, { "completion_length": 143.21875, "epoch": 0.7554842161583735, "grad_norm": 0.8271569609642029, "kl": 0.1847621649503708, "learning_rate": 4.657239215162745e-06, "loss": 0.0074, "reward": 1.9257187843322754, "reward_std": 0.303536593914032, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.488218754529953, "step": 1412 }, { "completion_length": 131.0625, "epoch": 0.7560192616372392, "grad_norm": 1.1983897686004639, "kl": 0.1888720840215683, "learning_rate": 4.6564521816854e-06, "loss": 0.0076, "reward": 2.3946876525878906, "reward_std": 0.8550981283187866, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4571875035762787, "step": 1413 }, { "completion_length": 126.28125, "epoch": 0.7565543071161048, "grad_norm": 0.9045872688293457, "kl": 0.17954625189304352, "learning_rate": 4.6556643123258674e-06, "loss": 0.0072, "reward": 2.40625, "reward_std": 0.757384717464447, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46875, "step": 1414 }, { "completion_length": 128.8125, "epoch": 0.7570893525949706, "grad_norm": 1.1305960416793823, "kl": 0.15611764788627625, "learning_rate": 4.654875607389539e-06, "loss": 0.0062, "reward": 2.4821250438690186, "reward_std": 0.6800485849380493, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4664999842643738, "step": 1415 }, { "completion_length": 129.65625, "epoch": 0.7576243980738363, "grad_norm": 2.371354341506958, "kl": 0.3561002016067505, "learning_rate": 4.654086067182131e-06, "loss": 0.0142, "reward": 2.2236876487731934, "reward_std": 0.6503652334213257, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4111874997615814, "step": 1416 }, { "completion_length": 143.65625, "epoch": 0.7581594435527019, "grad_norm": 1.073111891746521, "kl": 0.20099198818206787, "learning_rate": 4.653295692009686e-06, "loss": 0.008, "reward": 1.6443437337875366, "reward_std": 0.7389428615570068, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4568437337875366, "step": 1417 }, { "completion_length": 122.3125, "epoch": 0.7586944890315677, "grad_norm": 1.2963736057281494, "kl": 0.2287546694278717, "learning_rate": 4.652504482178569e-06, "loss": 0.0092, "reward": 2.693406105041504, "reward_std": 0.8267987370491028, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49028125405311584, "step": 1418 }, { "completion_length": 131.5, "epoch": 0.7592295345104334, "grad_norm": 0.8636448383331299, "kl": 0.20150107145309448, "learning_rate": 4.651712437995466e-06, "loss": 0.0081, "reward": 1.6980311870574951, "reward_std": 0.4869370758533478, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4636562466621399, "step": 1419 }, { "completion_length": 133.9375, "epoch": 0.7597645799892991, "grad_norm": 1.096082329750061, "kl": 0.15285995602607727, "learning_rate": 4.650919559767389e-06, "loss": 0.0061, "reward": 1.6089999675750732, "reward_std": 0.5917670130729675, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45274999737739563, "step": 1420 }, { "completion_length": 141.84375, "epoch": 0.7602996254681648, "grad_norm": 1.7248010635375977, "kl": 0.17670589685440063, "learning_rate": 4.650125847801675e-06, "loss": 0.0071, "reward": 1.6343437433242798, "reward_std": 0.9165283441543579, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4624687433242798, "step": 1421 }, { "completion_length": 142.3125, "epoch": 0.7608346709470305, "grad_norm": 1.986564040184021, "kl": 0.1515914499759674, "learning_rate": 4.6493313024059794e-06, "loss": 0.0061, "reward": 1.755312442779541, "reward_std": 0.9824106097221375, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4115625023841858, "step": 1422 }, { "completion_length": 143.0, "epoch": 0.7613697164258962, "grad_norm": 0.557988703250885, "kl": 0.16094616055488586, "learning_rate": 4.648535923888285e-06, "loss": 0.0064, "reward": 1.3162813186645508, "reward_std": 0.49943235516548157, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.441281259059906, "step": 1423 }, { "completion_length": 123.875, "epoch": 0.7619047619047619, "grad_norm": 2.0162408351898193, "kl": 0.1588585376739502, "learning_rate": 4.647739712556893e-06, "loss": 0.0064, "reward": 2.3927500247955322, "reward_std": 0.7054206132888794, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45524999499320984, "step": 1424 }, { "completion_length": 125.8125, "epoch": 0.7624398073836276, "grad_norm": 0.8009841442108154, "kl": 0.16832111775875092, "learning_rate": 4.646942668720434e-06, "loss": 0.0067, "reward": 1.955718755722046, "reward_std": 0.507848858833313, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4713437557220459, "step": 1425 }, { "completion_length": 116.3125, "epoch": 0.7629748528624933, "grad_norm": 4.086243152618408, "kl": 0.4371888339519501, "learning_rate": 4.646144792687855e-06, "loss": 0.0175, "reward": 2.296750068664551, "reward_std": 0.7871233224868774, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 1426 }, { "completion_length": 144.625, "epoch": 0.763509898341359, "grad_norm": 1.3721489906311035, "kl": 0.1577291637659073, "learning_rate": 4.645346084768428e-06, "loss": 0.0063, "reward": 1.6929374933242798, "reward_std": 0.8287969827651978, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4116874933242798, "step": 1427 }, { "completion_length": 124.5, "epoch": 0.7640449438202247, "grad_norm": 0.4063170254230499, "kl": 0.16654306650161743, "learning_rate": 4.64454654527175e-06, "loss": 0.0067, "reward": 1.7267812490463257, "reward_std": 0.4372849464416504, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4767812490463257, "step": 1428 }, { "completion_length": 149.875, "epoch": 0.7645799892990904, "grad_norm": 0.6151107549667358, "kl": 0.16490325331687927, "learning_rate": 4.643746174507734e-06, "loss": 0.0066, "reward": 1.6756250858306885, "reward_std": 0.9167138338088989, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4412500262260437, "step": 1429 }, { "completion_length": 141.28125, "epoch": 0.7651150347779562, "grad_norm": 3.9798319339752197, "kl": 0.2533392906188965, "learning_rate": 4.642944972786622e-06, "loss": 0.0101, "reward": 1.7237813472747803, "reward_std": 1.0412527322769165, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4425312578678131, "step": 1430 }, { "completion_length": 143.875, "epoch": 0.7656500802568218, "grad_norm": 2.1925721168518066, "kl": 0.3430944085121155, "learning_rate": 4.642142940418973e-06, "loss": 0.0137, "reward": 1.984375, "reward_std": 0.9959288835525513, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.484375, "step": 1431 }, { "completion_length": 136.6875, "epoch": 0.7661851257356875, "grad_norm": 1.4619758129119873, "kl": 0.25790536403656006, "learning_rate": 4.641340077715673e-06, "loss": 0.0103, "reward": 2.76953125, "reward_std": 0.9187721014022827, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1432 }, { "completion_length": 121.96875, "epoch": 0.7667201712145533, "grad_norm": 4.417191982269287, "kl": 0.2530064284801483, "learning_rate": 4.640536384987924e-06, "loss": 0.0101, "reward": 2.303406238555908, "reward_std": 0.9391374588012695, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4752812385559082, "step": 1433 }, { "completion_length": 141.8125, "epoch": 0.7672552166934189, "grad_norm": 2.810452461242676, "kl": 0.2644825279712677, "learning_rate": 4.639731862547254e-06, "loss": 0.0106, "reward": 2.0193750858306885, "reward_std": 1.0302197933197021, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4568749964237213, "step": 1434 }, { "completion_length": 120.21875, "epoch": 0.7677902621722846, "grad_norm": 1.113258957862854, "kl": 0.17972159385681152, "learning_rate": 4.638926510705511e-06, "loss": 0.0072, "reward": 2.3998124599456787, "reward_std": 1.037482500076294, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4935624897480011, "step": 1435 }, { "completion_length": 135.9375, "epoch": 0.7683253076511504, "grad_norm": 1.5949597358703613, "kl": 0.25175490975379944, "learning_rate": 4.638120329774866e-06, "loss": 0.0101, "reward": 1.8351249694824219, "reward_std": 0.8014534115791321, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46012499928474426, "step": 1436 }, { "completion_length": 161.34375, "epoch": 0.7688603531300161, "grad_norm": 0.9170966148376465, "kl": 0.23498719930648804, "learning_rate": 4.637313320067808e-06, "loss": 0.0094, "reward": 1.1386250257492065, "reward_std": 0.47286638617515564, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41987499594688416, "step": 1437 }, { "completion_length": 138.9375, "epoch": 0.7693953986088817, "grad_norm": 1.9795721769332886, "kl": 0.19152337312698364, "learning_rate": 4.636505481897151e-06, "loss": 0.0077, "reward": 1.9765625, "reward_std": 1.129835605621338, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4453125, "step": 1438 }, { "completion_length": 159.625, "epoch": 0.7699304440877475, "grad_norm": 2.5249550342559814, "kl": 0.1669209599494934, "learning_rate": 4.6356968155760285e-06, "loss": 0.0067, "reward": 2.0546875, "reward_std": 1.3341584205627441, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4296875, "step": 1439 }, { "completion_length": 113.03125, "epoch": 0.7704654895666132, "grad_norm": 1.6098659038543701, "kl": 0.16825570166110992, "learning_rate": 4.634887321417895e-06, "loss": 0.0067, "reward": 1.7137500047683716, "reward_std": 0.9383065104484558, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4325000047683716, "step": 1440 }, { "completion_length": 145.78125, "epoch": 0.7710005350454788, "grad_norm": 0.9376636743545532, "kl": 0.1473657339811325, "learning_rate": 4.634076999736525e-06, "loss": 0.0059, "reward": 2.055906295776367, "reward_std": 1.0165588855743408, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4465312361717224, "step": 1441 }, { "completion_length": 129.65625, "epoch": 0.7715355805243446, "grad_norm": 1.0597429275512695, "kl": 0.19449350237846375, "learning_rate": 4.633265850846016e-06, "loss": 0.0078, "reward": 2.399656295776367, "reward_std": 0.8158059120178223, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4621562361717224, "step": 1442 }, { "completion_length": 123.9375, "epoch": 0.7720706260032103, "grad_norm": 21.327245712280273, "kl": 0.20531810820102692, "learning_rate": 4.632453875060784e-06, "loss": 0.0082, "reward": 2.5, "reward_std": 0.7644776105880737, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 1443 }, { "completion_length": 155.59375, "epoch": 0.7726056714820759, "grad_norm": 15.995689392089844, "kl": 0.27861282229423523, "learning_rate": 4.631641072695566e-06, "loss": 0.0111, "reward": 1.3501249551773071, "reward_std": 0.5324795246124268, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4438750147819519, "step": 1444 }, { "completion_length": 139.34375, "epoch": 0.7731407169609417, "grad_norm": 0.6302751898765564, "kl": 0.1483798772096634, "learning_rate": 4.630827444065421e-06, "loss": 0.0059, "reward": 2.27734375, "reward_std": 0.732881486415863, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46484375, "step": 1445 }, { "completion_length": 124.0625, "epoch": 0.7736757624398074, "grad_norm": 1.2460204362869263, "kl": 0.23001177608966827, "learning_rate": 4.630012989485727e-06, "loss": 0.0092, "reward": 2.1561875343322754, "reward_std": 0.43348151445388794, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 1446 }, { "completion_length": 152.3125, "epoch": 0.7742108079186731, "grad_norm": 0.821855902671814, "kl": 0.18520501255989075, "learning_rate": 4.629197709272181e-06, "loss": 0.0074, "reward": 1.5839062929153442, "reward_std": 1.1806368827819824, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.39640623331069946, "step": 1447 }, { "completion_length": 150.1875, "epoch": 0.7747458533975388, "grad_norm": 2.887439250946045, "kl": 0.1987294852733612, "learning_rate": 4.628381603740802e-06, "loss": 0.0079, "reward": 2.3154375553131104, "reward_std": 1.0801129341125488, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4248124957084656, "step": 1448 }, { "completion_length": 136.9375, "epoch": 0.7752808988764045, "grad_norm": 1.805570363998413, "kl": 0.15495836734771729, "learning_rate": 4.627564673207928e-06, "loss": 0.0062, "reward": 1.5217812061309814, "reward_std": 0.4648135006427765, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42803123593330383, "step": 1449 }, { "completion_length": 116.71875, "epoch": 0.7758159443552702, "grad_norm": 4.579245090484619, "kl": 0.34879982471466064, "learning_rate": 4.626746917990217e-06, "loss": 0.014, "reward": 1.7415937185287476, "reward_std": 0.8015268445014954, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47596874833106995, "step": 1450 }, { "completion_length": 134.5625, "epoch": 0.7763509898341359, "grad_norm": 0.8009027242660522, "kl": 0.15836752951145172, "learning_rate": 4.625928338404647e-06, "loss": 0.0063, "reward": 1.8125312328338623, "reward_std": 0.6520169377326965, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4531562328338623, "step": 1451 }, { "completion_length": 150.9375, "epoch": 0.7768860353130016, "grad_norm": 2.2603516578674316, "kl": 0.29812467098236084, "learning_rate": 4.625108934768514e-06, "loss": 0.0119, "reward": 1.6723437309265137, "reward_std": 0.8008057475090027, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45359376072883606, "step": 1452 }, { "completion_length": 122.96875, "epoch": 0.7774210807918673, "grad_norm": 0.9133826494216919, "kl": 0.19880202412605286, "learning_rate": 4.624288707399436e-06, "loss": 0.008, "reward": 2.0345935821533203, "reward_std": 0.35644447803497314, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47209376096725464, "step": 1453 }, { "completion_length": 160.375, "epoch": 0.777956126270733, "grad_norm": 1.6293116807937622, "kl": 0.16557677090168, "learning_rate": 4.623467656615348e-06, "loss": 0.0066, "reward": 1.5611250400543213, "reward_std": 0.8932925462722778, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4205000102519989, "step": 1454 }, { "completion_length": 132.09375, "epoch": 0.7784911717495987, "grad_norm": 1.2266733646392822, "kl": 0.18702900409698486, "learning_rate": 4.622645782734504e-06, "loss": 0.0075, "reward": 1.8501875400543213, "reward_std": 0.6256508231163025, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.3970625102519989, "step": 1455 }, { "completion_length": 109.71875, "epoch": 0.7790262172284644, "grad_norm": 7.519050121307373, "kl": 0.1872653365135193, "learning_rate": 4.62182308607548e-06, "loss": 0.0075, "reward": 2.480062484741211, "reward_std": 0.7707450985908508, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49568748474121094, "step": 1456 }, { "completion_length": 130.03125, "epoch": 0.7795612627073302, "grad_norm": 5.603870868682861, "kl": 0.5290459990501404, "learning_rate": 4.620999566957168e-06, "loss": 0.0212, "reward": 1.8010624647140503, "reward_std": 0.42589104175567627, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4260624945163727, "step": 1457 }, { "completion_length": 147.65625, "epoch": 0.7800963081861958, "grad_norm": 1.0796313285827637, "kl": 0.16052718460559845, "learning_rate": 4.620175225698781e-06, "loss": 0.0064, "reward": 1.2890625, "reward_std": 0.8302798867225647, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3671875, "step": 1458 }, { "completion_length": 134.71875, "epoch": 0.7806313536650615, "grad_norm": 111827.2109375, "kl": 412.7781677246094, "learning_rate": 4.6193500626198465e-06, "loss": 16.5111, "reward": 2.359375, "reward_std": 0.9922075271606445, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46875, "step": 1459 }, { "completion_length": 135.8125, "epoch": 0.7811663991439273, "grad_norm": 0.8810490369796753, "kl": 0.19931992888450623, "learning_rate": 4.6185240780402165e-06, "loss": 0.008, "reward": 2.0, "reward_std": 0.7895898818969727, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.484375, "step": 1460 }, { "completion_length": 129.5625, "epoch": 0.7817014446227929, "grad_norm": 0.9507995247840881, "kl": 0.18652546405792236, "learning_rate": 4.617697272280057e-06, "loss": 0.0075, "reward": 2.41015625, "reward_std": 0.1811075508594513, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 1461 }, { "completion_length": 140.25, "epoch": 0.7822364901016586, "grad_norm": 0.8906077146530151, "kl": 0.14642146229743958, "learning_rate": 4.616869645659855e-06, "loss": 0.0059, "reward": 2.289843797683716, "reward_std": 0.9521814584732056, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44609373807907104, "step": 1462 }, { "completion_length": 152.125, "epoch": 0.7827715355805244, "grad_norm": 3.544616937637329, "kl": 0.15278027951717377, "learning_rate": 4.616041198500413e-06, "loss": 0.0061, "reward": 1.5922812223434448, "reward_std": 0.5058701634407043, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4516562521457672, "step": 1463 }, { "completion_length": 144.9375, "epoch": 0.78330658105939, "grad_norm": 2.1026041507720947, "kl": 0.13576515018939972, "learning_rate": 4.615211931122854e-06, "loss": 0.0054, "reward": 1.4394373893737793, "reward_std": 0.8110764026641846, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42381250858306885, "step": 1464 }, { "completion_length": 137.34375, "epoch": 0.7838416265382557, "grad_norm": 1.5653598308563232, "kl": 0.20680652558803558, "learning_rate": 4.614381843848617e-06, "loss": 0.0083, "reward": 2.127781391143799, "reward_std": 0.49224525690078735, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4871562421321869, "step": 1465 }, { "completion_length": 132.3125, "epoch": 0.7843766720171215, "grad_norm": 0.8309110403060913, "kl": 0.269974023103714, "learning_rate": 4.61355093699946e-06, "loss": 0.0108, "reward": 1.7358437776565552, "reward_std": 0.5193212628364563, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4545937478542328, "step": 1466 }, { "completion_length": 129.0625, "epoch": 0.7849117174959872, "grad_norm": 5.354910850524902, "kl": 0.2262510508298874, "learning_rate": 4.612719210897459e-06, "loss": 0.0091, "reward": 1.9269062280654907, "reward_std": 0.9123594760894775, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4581562578678131, "step": 1467 }, { "completion_length": 160.34375, "epoch": 0.7854467629748528, "grad_norm": 1.6676713228225708, "kl": 0.17810454964637756, "learning_rate": 4.611886665865007e-06, "loss": 0.0071, "reward": 1.40625, "reward_std": 1.0404658317565918, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.359375, "step": 1468 }, { "completion_length": 134.34375, "epoch": 0.7859818084537186, "grad_norm": 0.8097301125526428, "kl": 0.15848836302757263, "learning_rate": 4.6110533022248136e-06, "loss": 0.0063, "reward": 2.013937473297119, "reward_std": 1.165050983428955, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43581250309944153, "step": 1469 }, { "completion_length": 159.8125, "epoch": 0.7865168539325843, "grad_norm": 0.591751217842102, "kl": 0.13580375909805298, "learning_rate": 4.610219120299907e-06, "loss": 0.0054, "reward": 1.3698437213897705, "reward_std": 0.7077425718307495, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4323437511920929, "step": 1470 }, { "completion_length": 137.3125, "epoch": 0.7870518994114499, "grad_norm": 6.768417835235596, "kl": 0.31044939160346985, "learning_rate": 4.609384120413632e-06, "loss": 0.0124, "reward": 1.7421875, "reward_std": 0.5642480254173279, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 1471 }, { "completion_length": 178.6875, "epoch": 0.7875869448903157, "grad_norm": 2.1294779777526855, "kl": 0.09866812825202942, "learning_rate": 4.608548302889651e-06, "loss": 0.0039, "reward": 1.2236875295639038, "reward_std": 1.0289103984832764, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, "rewards/xmlcount_reward_func": 0.3018124997615814, "step": 1472 }, { "completion_length": 162.875, "epoch": 0.7881219903691814, "grad_norm": 1.2504005432128906, "kl": 0.12283752858638763, "learning_rate": 4.607711668051942e-06, "loss": 0.0049, "reward": 1.3306875228881836, "reward_std": 0.7725344896316528, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3931874930858612, "step": 1473 }, { "completion_length": 156.1875, "epoch": 0.7886570358480471, "grad_norm": 3.570202350616455, "kl": 0.22681421041488647, "learning_rate": 4.606874216224801e-06, "loss": 0.0091, "reward": 1.0302813053131104, "reward_std": 0.8029956817626953, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.3427812457084656, "step": 1474 }, { "completion_length": 156.375, "epoch": 0.7891920813269128, "grad_norm": 0.3707447946071625, "kl": 0.11204474419355392, "learning_rate": 4.6060359477328416e-06, "loss": 0.0045, "reward": 1.7803750038146973, "reward_std": 0.37939155101776123, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43662500381469727, "step": 1475 }, { "completion_length": 139.03125, "epoch": 0.7897271268057785, "grad_norm": 2.4259464740753174, "kl": 0.2975384593009949, "learning_rate": 4.605196862900991e-06, "loss": 0.0119, "reward": 1.5800937414169312, "reward_std": 1.0463647842407227, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.40821874141693115, "step": 1476 }, { "completion_length": 150.96875, "epoch": 0.7902621722846442, "grad_norm": 1.2536014318466187, "kl": 0.1335136443376541, "learning_rate": 4.604356962054495e-06, "loss": 0.0053, "reward": 1.862781286239624, "reward_std": 0.7779346704483032, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.40965625643730164, "step": 1477 }, { "completion_length": 142.6875, "epoch": 0.7907972177635099, "grad_norm": 1.3767077922821045, "kl": 0.2179417908191681, "learning_rate": 4.6035162455189165e-06, "loss": 0.0087, "reward": 2.293750047683716, "reward_std": 0.8690296411514282, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48125001788139343, "step": 1478 }, { "completion_length": 137.1875, "epoch": 0.7913322632423756, "grad_norm": 1.1921947002410889, "kl": 0.1602851152420044, "learning_rate": 4.602674713620131e-06, "loss": 0.0064, "reward": 1.5708436965942383, "reward_std": 0.7813485860824585, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44584375619888306, "step": 1479 }, { "completion_length": 152.0625, "epoch": 0.7918673087212413, "grad_norm": 1.9678574800491333, "kl": 0.18274684250354767, "learning_rate": 4.601832366684335e-06, "loss": 0.0073, "reward": 1.64453125, "reward_std": 0.8848206996917725, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44140625, "step": 1480 }, { "completion_length": 145.59375, "epoch": 0.792402354200107, "grad_norm": 1.9851423501968384, "kl": 0.14657153189182281, "learning_rate": 4.600989205038037e-06, "loss": 0.0059, "reward": 1.2977187633514404, "reward_std": 0.7971574068069458, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.39146876335144043, "step": 1481 }, { "completion_length": 128.625, "epoch": 0.7929373996789727, "grad_norm": 1.0142841339111328, "kl": 0.24220381677150726, "learning_rate": 4.600145229008062e-06, "loss": 0.0097, "reward": 2.2784061431884766, "reward_std": 0.9638139009475708, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4502812623977661, "step": 1482 }, { "completion_length": 127.4375, "epoch": 0.7934724451578384, "grad_norm": 2.283796548843384, "kl": 0.1788589060306549, "learning_rate": 4.599300438921553e-06, "loss": 0.0072, "reward": 2.06640625, "reward_std": 1.027045726776123, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45703125, "step": 1483 }, { "completion_length": 156.25, "epoch": 0.7940074906367042, "grad_norm": 3420.43017578125, "kl": 724.6160278320312, "learning_rate": 4.5984548351059645e-06, "loss": 28.9846, "reward": 1.6925313472747803, "reward_std": 1.0113524198532104, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4425312578678131, "step": 1484 }, { "completion_length": 158.03125, "epoch": 0.7945425361155698, "grad_norm": 258304753664.0, "kl": 5236619776.0, "learning_rate": 4.59760841788907e-06, "loss": 209464784.0, "reward": 1.388281226158142, "reward_std": 0.9277973175048828, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3726562261581421, "step": 1485 }, { "completion_length": 131.15625, "epoch": 0.7950775815944355, "grad_norm": 1.629246473312378, "kl": 0.17425492405891418, "learning_rate": 4.596761187598958e-06, "loss": 0.007, "reward": 1.678093671798706, "reward_std": 0.7947598695755005, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42809373140335083, "step": 1486 }, { "completion_length": 152.4375, "epoch": 0.7956126270733013, "grad_norm": 1.4502211809158325, "kl": 0.15094256401062012, "learning_rate": 4.59591314456403e-06, "loss": 0.006, "reward": 1.556687593460083, "reward_std": 0.9022508263587952, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41606250405311584, "step": 1487 }, { "completion_length": 130.71875, "epoch": 0.7961476725521669, "grad_norm": 0.8018375635147095, "kl": 0.20601701736450195, "learning_rate": 4.595064289113003e-06, "loss": 0.0082, "reward": 2.212437629699707, "reward_std": 0.9165427684783936, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4468125104904175, "step": 1488 }, { "completion_length": 150.25, "epoch": 0.7966827180310326, "grad_norm": 0.8659740090370178, "kl": 0.15313854813575745, "learning_rate": 4.594214621574912e-06, "loss": 0.0061, "reward": 1.1756563186645508, "reward_std": 0.4479737877845764, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.410031259059906, "step": 1489 }, { "completion_length": 132.0, "epoch": 0.7972177635098984, "grad_norm": 168.38560485839844, "kl": 2.7442402839660645, "learning_rate": 4.593364142279103e-06, "loss": 0.1098, "reward": 2.0706562995910645, "reward_std": 1.268218755722046, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4300312399864197, "step": 1490 }, { "completion_length": 128.5, "epoch": 0.797752808988764, "grad_norm": 1.5782878398895264, "kl": 0.19202512502670288, "learning_rate": 4.5925128515552375e-06, "loss": 0.0077, "reward": 1.926031231880188, "reward_std": 0.6539525985717773, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.488531231880188, "step": 1491 }, { "completion_length": 127.53125, "epoch": 0.7982878544676297, "grad_norm": 47.730831146240234, "kl": 0.6881322264671326, "learning_rate": 4.591660749733294e-06, "loss": 0.0275, "reward": 1.6450624465942383, "reward_std": 0.7312977313995361, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.41068750619888306, "step": 1492 }, { "completion_length": 127.15625, "epoch": 0.7988228999464955, "grad_norm": 0.7180033326148987, "kl": 0.16165602207183838, "learning_rate": 4.590807837143561e-06, "loss": 0.0065, "reward": 2.476156234741211, "reward_std": 1.0395530462265015, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46053123474121094, "step": 1493 }, { "completion_length": 137.59375, "epoch": 0.7993579454253612, "grad_norm": 289.69793701171875, "kl": 16.236360549926758, "learning_rate": 4.589954114116644e-06, "loss": 0.6495, "reward": 2.2150936126708984, "reward_std": 0.9613021016120911, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.433843731880188, "step": 1494 }, { "completion_length": 135.03125, "epoch": 0.7998929909042268, "grad_norm": 1.2903941869735718, "kl": 0.17669692635536194, "learning_rate": 4.589099580983465e-06, "loss": 0.0071, "reward": 2.549875020980835, "reward_std": 1.1661115884780884, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4404999911785126, "step": 1495 }, { "completion_length": 137.625, "epoch": 0.8004280363830926, "grad_norm": 1.1209717988967896, "kl": 0.14385788142681122, "learning_rate": 4.588244238075255e-06, "loss": 0.0058, "reward": 2.374875068664551, "reward_std": 1.079763412475586, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.484250009059906, "step": 1496 }, { "completion_length": 127.9375, "epoch": 0.8009630818619583, "grad_norm": 1.9386311769485474, "kl": 0.16582879424095154, "learning_rate": 4.587388085723563e-06, "loss": 0.0066, "reward": 2.209656238555908, "reward_std": 0.5792481303215027, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3502812385559082, "step": 1497 }, { "completion_length": 134.96875, "epoch": 0.8014981273408239, "grad_norm": 1.9103505611419678, "kl": 0.24871258437633514, "learning_rate": 4.586531124260248e-06, "loss": 0.0099, "reward": 2.1009373664855957, "reward_std": 0.9900234937667847, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.39781251549720764, "step": 1498 }, { "completion_length": 121.9375, "epoch": 0.8020331728196897, "grad_norm": 4.140609264373779, "kl": 0.3287076950073242, "learning_rate": 4.585673354017487e-06, "loss": 0.0131, "reward": 2.5420937538146973, "reward_std": 1.237928032875061, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46396875381469727, "step": 1499 }, { "completion_length": 136.59375, "epoch": 0.8025682182985554, "grad_norm": 2.134119987487793, "kl": 0.16471238434314728, "learning_rate": 4.584814775327766e-06, "loss": 0.0066, "reward": 1.6224374771118164, "reward_std": 0.6891791820526123, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3880624771118164, "step": 1500 }, { "completion_length": 147.84375, "epoch": 0.803103263777421, "grad_norm": 2.767320394515991, "kl": 0.20477065443992615, "learning_rate": 4.583955388523888e-06, "loss": 0.0082, "reward": 1.9422812461853027, "reward_std": 0.8748629689216614, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.37978124618530273, "step": 1501 }, { "completion_length": 126.28125, "epoch": 0.8036383092562868, "grad_norm": 1.5804747343063354, "kl": 0.2397814691066742, "learning_rate": 4.583095193938968e-06, "loss": 0.0096, "reward": 1.9180938005447388, "reward_std": 0.8329014778137207, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.371218740940094, "step": 1502 }, { "completion_length": 146.90625, "epoch": 0.8041733547351525, "grad_norm": 1.7462775707244873, "kl": 0.25630149245262146, "learning_rate": 4.582234191906432e-06, "loss": 0.0103, "reward": 1.0875937938690186, "reward_std": 0.4278267025947571, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40009376406669617, "step": 1503 }, { "completion_length": 122.75, "epoch": 0.8047084002140182, "grad_norm": 0.9919488430023193, "kl": 0.17098557949066162, "learning_rate": 4.581372382760023e-06, "loss": 0.0068, "reward": 1.5304062366485596, "reward_std": 0.46848198771476746, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48353123664855957, "step": 1504 }, { "completion_length": 132.46875, "epoch": 0.8052434456928839, "grad_norm": 1.4276385307312012, "kl": 0.1613025665283203, "learning_rate": 4.5805097668337935e-06, "loss": 0.0065, "reward": 2.4260311126708984, "reward_std": 0.7419514656066895, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4416562616825104, "step": 1505 }, { "completion_length": 140.375, "epoch": 0.8057784911717496, "grad_norm": 0.8229833841323853, "kl": 0.1414877325296402, "learning_rate": 4.579646344462111e-06, "loss": 0.0057, "reward": 2.1116251945495605, "reward_std": 0.8440510630607605, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4710000157356262, "step": 1506 }, { "completion_length": 129.4375, "epoch": 0.8063135366506153, "grad_norm": 2.609978675842285, "kl": 0.19532868266105652, "learning_rate": 4.5787821159796535e-06, "loss": 0.0078, "reward": 2.0073752403259277, "reward_std": 0.9710245132446289, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46050000190734863, "step": 1507 }, { "completion_length": 134.8125, "epoch": 0.806848582129481, "grad_norm": 2.237220287322998, "kl": 0.18633511662483215, "learning_rate": 4.577917081721414e-06, "loss": 0.0075, "reward": 1.7310937643051147, "reward_std": 1.0604168176651, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41859376430511475, "step": 1508 }, { "completion_length": 146.625, "epoch": 0.8073836276083467, "grad_norm": 1.1388750076293945, "kl": 0.13990378379821777, "learning_rate": 4.577051242022693e-06, "loss": 0.0056, "reward": 1.9210624694824219, "reward_std": 1.0610504150390625, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4366874694824219, "step": 1509 }, { "completion_length": 140.375, "epoch": 0.8079186730872124, "grad_norm": 8.625699996948242, "kl": 0.5903922915458679, "learning_rate": 4.576184597219111e-06, "loss": 0.0236, "reward": 2.0583748817443848, "reward_std": 0.6725653409957886, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4802500009536743, "step": 1510 }, { "completion_length": 125.6875, "epoch": 0.8084537185660781, "grad_norm": 0.9717795848846436, "kl": 0.15567809343338013, "learning_rate": 4.575317147646593e-06, "loss": 0.0062, "reward": 2.4442811012268066, "reward_std": 0.58965665102005, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4911562502384186, "step": 1511 }, { "completion_length": 132.3125, "epoch": 0.8089887640449438, "grad_norm": 0.89247065782547, "kl": 0.19221018254756927, "learning_rate": 4.57444889364138e-06, "loss": 0.0077, "reward": 2.0266873836517334, "reward_std": 1.0992999076843262, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.43293750286102295, "step": 1512 }, { "completion_length": 120.59375, "epoch": 0.8095238095238095, "grad_norm": 0.6792510747909546, "kl": 0.15641751885414124, "learning_rate": 4.5735798355400245e-06, "loss": 0.0063, "reward": 1.9950000047683716, "reward_std": 0.6061429977416992, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4793750047683716, "step": 1513 }, { "completion_length": 126.53125, "epoch": 0.8100588550026753, "grad_norm": 53862308.0, "kl": 85144.75, "learning_rate": 4.572709973679389e-06, "loss": 3405.7891, "reward": 2.6130313873291016, "reward_std": 0.9615206718444824, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.488031268119812, "step": 1514 }, { "completion_length": 153.0, "epoch": 0.8105939004815409, "grad_norm": 7.533525466918945, "kl": 0.8126378655433655, "learning_rate": 4.571839308396649e-06, "loss": 0.0325, "reward": 1.7760624885559082, "reward_std": 0.5753134489059448, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4166874885559082, "step": 1515 }, { "completion_length": 111.96875, "epoch": 0.8111289459604066, "grad_norm": 1.0839687585830688, "kl": 0.22813165187835693, "learning_rate": 4.570967840029292e-06, "loss": 0.0091, "reward": 1.5311250686645508, "reward_std": 0.5959504246711731, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 1516 }, { "completion_length": 159.3125, "epoch": 0.8116639914392724, "grad_norm": 6.048390865325928, "kl": 0.4309840500354767, "learning_rate": 4.570095568915115e-06, "loss": 0.0172, "reward": 0.8236562609672546, "reward_std": 0.40357112884521484, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.41740626096725464, "step": 1517 }, { "completion_length": 128.1875, "epoch": 0.812199036918138, "grad_norm": 1.393195629119873, "kl": 0.1776350736618042, "learning_rate": 4.569222495392227e-06, "loss": 0.0071, "reward": 2.1193125247955322, "reward_std": 0.6026460528373718, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49431249499320984, "step": 1518 }, { "completion_length": 121.78125, "epoch": 0.8127340823970037, "grad_norm": 1.4376277923583984, "kl": 0.1918668895959854, "learning_rate": 4.568348619799048e-06, "loss": 0.0077, "reward": 1.9989374876022339, "reward_std": 0.8243535757064819, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4989374876022339, "step": 1519 }, { "completion_length": 155.9375, "epoch": 0.8132691278758695, "grad_norm": 0.7113217115402222, "kl": 0.15761899948120117, "learning_rate": 4.567473942474308e-06, "loss": 0.0063, "reward": 0.7434375286102295, "reward_std": 0.3296242356300354, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.3371874988079071, "step": 1520 }, { "completion_length": 125.96875, "epoch": 0.8138041733547352, "grad_norm": 1.7736800909042358, "kl": 0.15492400527000427, "learning_rate": 4.566598463757051e-06, "loss": 0.0062, "reward": 1.547374963760376, "reward_std": 0.6948650479316711, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46924999356269836, "step": 1521 }, { "completion_length": 139.96875, "epoch": 0.8143392188336008, "grad_norm": 1.0461381673812866, "kl": 0.19310246407985687, "learning_rate": 4.565722183986627e-06, "loss": 0.0077, "reward": 1.9686250686645508, "reward_std": 0.6089770197868347, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.468625009059906, "step": 1522 }, { "completion_length": 150.84375, "epoch": 0.8148742643124666, "grad_norm": 59.510169982910156, "kl": 0.31626349687576294, "learning_rate": 4.564845103502701e-06, "loss": 0.0127, "reward": 1.4595625400543213, "reward_std": 0.7479912042617798, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4283125102519989, "step": 1523 }, { "completion_length": 158.125, "epoch": 0.8154093097913323, "grad_norm": 2.27524995803833, "kl": 0.19101645052433014, "learning_rate": 4.563967222645244e-06, "loss": 0.0076, "reward": 1.6256250143051147, "reward_std": 0.9741418361663818, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.37562498450279236, "step": 1524 }, { "completion_length": 134.0625, "epoch": 0.8159443552701979, "grad_norm": 0.9904637932777405, "kl": 0.2677461802959442, "learning_rate": 4.5630885417545414e-06, "loss": 0.0107, "reward": 2.390625, "reward_std": 0.8808801174163818, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46875, "step": 1525 }, { "completion_length": 146.5, "epoch": 0.8164794007490637, "grad_norm": 1.0197038650512695, "kl": 0.1358419954776764, "learning_rate": 4.562209061171185e-06, "loss": 0.0054, "reward": 1.534468650817871, "reward_std": 0.9134243130683899, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.40946877002716064, "step": 1526 }, { "completion_length": 137.6875, "epoch": 0.8170144462279294, "grad_norm": 0.9571510553359985, "kl": 0.23098811507225037, "learning_rate": 4.56132878123608e-06, "loss": 0.0092, "reward": 1.6563124656677246, "reward_std": 0.559026300907135, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.468812495470047, "step": 1527 }, { "completion_length": 121.40625, "epoch": 0.817549491706795, "grad_norm": 2.0469558238983154, "kl": 0.2752302289009094, "learning_rate": 4.560447702290438e-06, "loss": 0.011, "reward": 2.203125, "reward_std": 0.7179440259933472, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 1528 }, { "completion_length": 138.8125, "epoch": 0.8180845371856608, "grad_norm": 1.759224772453308, "kl": 0.292511522769928, "learning_rate": 4.559565824675783e-06, "loss": 0.0117, "reward": 2.0814688205718994, "reward_std": 0.7590776681900024, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44084376096725464, "step": 1529 }, { "completion_length": 142.34375, "epoch": 0.8186195826645265, "grad_norm": 3.9325859546661377, "kl": 0.20611366629600525, "learning_rate": 4.558683148733949e-06, "loss": 0.0082, "reward": 1.657249927520752, "reward_std": 0.666230320930481, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42287498712539673, "step": 1530 }, { "completion_length": 151.8125, "epoch": 0.8191546281433922, "grad_norm": 1.584647297859192, "kl": 0.14384490251541138, "learning_rate": 4.557799674807076e-06, "loss": 0.0058, "reward": 1.8402187824249268, "reward_std": 1.0470829010009766, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43396875262260437, "step": 1531 }, { "completion_length": 139.0, "epoch": 0.8196896736222579, "grad_norm": 11966140.0, "kl": 412659.65625, "learning_rate": 4.556915403237616e-06, "loss": 16506.3887, "reward": 1.6963750123977661, "reward_std": 0.7827565670013428, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4620000123977661, "step": 1532 }, { "completion_length": 122.5625, "epoch": 0.8202247191011236, "grad_norm": 2.173969268798828, "kl": 0.17688851058483124, "learning_rate": 4.55603033436833e-06, "loss": 0.0071, "reward": 2.351062536239624, "reward_std": 0.5671406388282776, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47606250643730164, "step": 1533 }, { "completion_length": 140.75, "epoch": 0.8207597645799893, "grad_norm": 47144.9765625, "kl": 37.96331787109375, "learning_rate": 4.555144468542287e-06, "loss": 1.5185, "reward": 1.5880625247955322, "reward_std": 0.9955516457557678, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43181249499320984, "step": 1534 }, { "completion_length": 120.375, "epoch": 0.821294810058855, "grad_norm": 3.3536221981048584, "kl": 0.21492713689804077, "learning_rate": 4.554257806102867e-06, "loss": 0.0086, "reward": 2.571500062942505, "reward_std": 0.5852646231651306, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4621250033378601, "step": 1535 }, { "completion_length": 139.625, "epoch": 0.8218298555377207, "grad_norm": 5.948284149169922, "kl": 0.5955742001533508, "learning_rate": 4.553370347393756e-06, "loss": 0.0238, "reward": 1.2009999752044678, "reward_std": 0.4255707263946533, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45100000500679016, "step": 1536 }, { "completion_length": 144.40625, "epoch": 0.8223649010165864, "grad_norm": 257.6431579589844, "kl": 0.26891109347343445, "learning_rate": 4.552482092758951e-06, "loss": 0.0108, "reward": 1.9257187843322754, "reward_std": 1.0007213354110718, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.425718754529953, "step": 1537 }, { "completion_length": 126.84375, "epoch": 0.8228999464954521, "grad_norm": 2.490870952606201, "kl": 0.176072359085083, "learning_rate": 4.5515930425427556e-06, "loss": 0.007, "reward": 1.6796875, "reward_std": 0.6311355829238892, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 1538 }, { "completion_length": 137.40625, "epoch": 0.8234349919743178, "grad_norm": 1.0340230464935303, "kl": 0.17705605924129486, "learning_rate": 4.550703197089782e-06, "loss": 0.0071, "reward": 1.818718671798706, "reward_std": 0.7173473834991455, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41246873140335083, "step": 1539 }, { "completion_length": 144.625, "epoch": 0.8239700374531835, "grad_norm": 1.3431185483932495, "kl": 0.1640772670507431, "learning_rate": 4.549812556744954e-06, "loss": 0.0066, "reward": 1.6911875009536743, "reward_std": 0.5594137907028198, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4724375009536743, "step": 1540 }, { "completion_length": 106.59375, "epoch": 0.8245050829320493, "grad_norm": 1.563502311706543, "kl": 0.2439035326242447, "learning_rate": 4.548921121853498e-06, "loss": 0.0098, "reward": 2.671875, "reward_std": 0.5070015788078308, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 1541 }, { "completion_length": 121.84375, "epoch": 0.8250401284109149, "grad_norm": 1.1756609678268433, "kl": 0.2004605531692505, "learning_rate": 4.548028892760953e-06, "loss": 0.008, "reward": 2.3323750495910645, "reward_std": 0.7708545923233032, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4886249899864197, "step": 1542 }, { "completion_length": 135.96875, "epoch": 0.8255751738897806, "grad_norm": 75.28291320800781, "kl": 23.015649795532227, "learning_rate": 4.547135869813163e-06, "loss": 0.9206, "reward": 1.3303749561309814, "reward_std": 0.746573269367218, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4553750157356262, "step": 1543 }, { "completion_length": 134.375, "epoch": 0.8261102193686464, "grad_norm": 1.7467772960662842, "kl": 0.22141319513320923, "learning_rate": 4.546242053356282e-06, "loss": 0.0089, "reward": 2.3526248931884766, "reward_std": 0.8826971650123596, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4463750123977661, "step": 1544 }, { "completion_length": 144.625, "epoch": 0.826645264847512, "grad_norm": 1.0638718605041504, "kl": 0.16663996875286102, "learning_rate": 4.545347443736769e-06, "loss": 0.0067, "reward": 1.68359375, "reward_std": 0.8123172521591187, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46484375, "step": 1545 }, { "completion_length": 144.625, "epoch": 0.8271803103263777, "grad_norm": 8.883301734924316, "kl": 0.23960238695144653, "learning_rate": 4.544452041301394e-06, "loss": 0.0096, "reward": 2.0738437175750732, "reward_std": 0.6878887414932251, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.464468777179718, "step": 1546 }, { "completion_length": 133.71875, "epoch": 0.8277153558052435, "grad_norm": 1.5703328847885132, "kl": 0.1636185497045517, "learning_rate": 4.54355584639723e-06, "loss": 0.0065, "reward": 1.45703125, "reward_std": 0.3642374575138092, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 1547 }, { "completion_length": 145.15625, "epoch": 0.8282504012841091, "grad_norm": 0.7322735786437988, "kl": 0.16187578439712524, "learning_rate": 4.54265885937166e-06, "loss": 0.0065, "reward": 1.9665312767028809, "reward_std": 0.7683740854263306, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.38840624690055847, "step": 1548 }, { "completion_length": 116.78125, "epoch": 0.8287854467629748, "grad_norm": 2.318741798400879, "kl": 0.2083406150341034, "learning_rate": 4.541761080572373e-06, "loss": 0.0083, "reward": 1.6437499523162842, "reward_std": 0.5602309703826904, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47187501192092896, "step": 1549 }, { "completion_length": 117.96875, "epoch": 0.8293204922418406, "grad_norm": 1.3000177145004272, "kl": 0.18785163760185242, "learning_rate": 4.540862510347367e-06, "loss": 0.0075, "reward": 2.5361876487731934, "reward_std": 0.8781756162643433, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4893124997615814, "step": 1550 }, { "completion_length": 120.78125, "epoch": 0.8298555377207063, "grad_norm": 1.2433470487594604, "kl": 0.309731125831604, "learning_rate": 4.5399631490449425e-06, "loss": 0.0124, "reward": 1.2886874675750732, "reward_std": 0.44015252590179443, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47618749737739563, "step": 1551 }, { "completion_length": 118.0625, "epoch": 0.8303905831995719, "grad_norm": 10.083845138549805, "kl": 0.16303324699401855, "learning_rate": 4.539062997013711e-06, "loss": 0.0065, "reward": 3.0585312843322754, "reward_std": 0.7322006225585938, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.496031254529953, "step": 1552 }, { "completion_length": 145.9375, "epoch": 0.8309256286784377, "grad_norm": 1227160.5, "kl": 1417.8194580078125, "learning_rate": 4.5381620546025875e-06, "loss": 56.7128, "reward": 2.2459373474121094, "reward_std": 0.9730486273765564, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4334375262260437, "step": 1553 }, { "completion_length": 137.375, "epoch": 0.8314606741573034, "grad_norm": 0.8830098509788513, "kl": 0.15355579555034637, "learning_rate": 4.537260322160796e-06, "loss": 0.0061, "reward": 1.8125, "reward_std": 0.7306488752365112, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46875, "step": 1554 }, { "completion_length": 127.1875, "epoch": 0.831995719636169, "grad_norm": 4.408754825592041, "kl": 0.3039938509464264, "learning_rate": 4.536357800037864e-06, "loss": 0.0122, "reward": 1.6617188453674316, "reward_std": 0.7731602787971497, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4585937559604645, "step": 1555 }, { "completion_length": 130.25, "epoch": 0.8325307651150348, "grad_norm": 4.8458356857299805, "kl": 0.3247110843658447, "learning_rate": 4.535454488583626e-06, "loss": 0.013, "reward": 2.1866250038146973, "reward_std": 1.0365341901779175, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.49912500381469727, "step": 1556 }, { "completion_length": 123.03125, "epoch": 0.8330658105939005, "grad_norm": 0.6243240833282471, "kl": 0.20730121433734894, "learning_rate": 4.534550388148225e-06, "loss": 0.0083, "reward": 2.6221249103546143, "reward_std": 0.83266681432724, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4971249997615814, "step": 1557 }, { "completion_length": 171.09375, "epoch": 0.8336008560727662, "grad_norm": 1.0973584651947021, "kl": 0.12596897780895233, "learning_rate": 4.5336454990821055e-06, "loss": 0.005, "reward": 1.485374927520752, "reward_std": 1.2828984260559082, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.36037498712539673, "step": 1558 }, { "completion_length": 128.0625, "epoch": 0.8341359015516319, "grad_norm": 0.5014882683753967, "kl": 0.15288925170898438, "learning_rate": 4.532739821736019e-06, "loss": 0.0061, "reward": 1.8695625066757202, "reward_std": 0.8956373929977417, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4789375066757202, "step": 1559 }, { "completion_length": 115.625, "epoch": 0.8346709470304976, "grad_norm": 1.3116796016693115, "kl": 0.1998971700668335, "learning_rate": 4.531833356461027e-06, "loss": 0.008, "reward": 1.5165936946868896, "reward_std": 0.6105121970176697, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4697187542915344, "step": 1560 }, { "completion_length": 116.875, "epoch": 0.8352059925093633, "grad_norm": 2.1656954288482666, "kl": 0.1941489577293396, "learning_rate": 4.530926103608491e-06, "loss": 0.0078, "reward": 2.65625, "reward_std": 1.0656503438949585, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.484375, "step": 1561 }, { "completion_length": 129.71875, "epoch": 0.835741037988229, "grad_norm": 0.9721166491508484, "kl": 0.15841126441955566, "learning_rate": 4.530018063530079e-06, "loss": 0.0063, "reward": 1.8876874446868896, "reward_std": 0.7143072485923767, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4658125042915344, "step": 1562 }, { "completion_length": 140.9375, "epoch": 0.8362760834670947, "grad_norm": 0.8114407658576965, "kl": 0.16720086336135864, "learning_rate": 4.529109236577764e-06, "loss": 0.0067, "reward": 1.9891562461853027, "reward_std": 1.1373093128204346, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44228124618530273, "step": 1563 }, { "completion_length": 144.0, "epoch": 0.8368111289459604, "grad_norm": 1.5823760032653809, "kl": 0.1531250774860382, "learning_rate": 4.528199623103827e-06, "loss": 0.0061, "reward": 1.050687551498413, "reward_std": 0.5367183685302734, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4100624918937683, "step": 1564 }, { "completion_length": 144.65625, "epoch": 0.8373461744248261, "grad_norm": 1.0232220888137817, "kl": 0.13171963393688202, "learning_rate": 4.527289223460851e-06, "loss": 0.0053, "reward": 1.5998125076293945, "reward_std": 0.5713021755218506, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.45918750762939453, "step": 1565 }, { "completion_length": 150.28125, "epoch": 0.8378812199036918, "grad_norm": 3.707866668701172, "kl": 0.15957331657409668, "learning_rate": 4.5263780380017234e-06, "loss": 0.0064, "reward": 1.7933125495910645, "reward_std": 1.2008874416351318, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4339374899864197, "step": 1566 }, { "completion_length": 111.65625, "epoch": 0.8384162653825575, "grad_norm": 1.0785197019577026, "kl": 0.202671617269516, "learning_rate": 4.525466067079637e-06, "loss": 0.0081, "reward": 2.609375, "reward_std": 0.9077122211456299, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 1567 }, { "completion_length": 115.96875, "epoch": 0.8389513108614233, "grad_norm": 1.7631988525390625, "kl": 0.31789082288742065, "learning_rate": 4.52455331104809e-06, "loss": 0.0127, "reward": 2.1724376678466797, "reward_std": 1.0788631439208984, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.48493748903274536, "step": 1568 }, { "completion_length": 125.625, "epoch": 0.8394863563402889, "grad_norm": 2.5367677211761475, "kl": 0.1950521320104599, "learning_rate": 4.523639770260884e-06, "loss": 0.0078, "reward": 2.1556875705718994, "reward_std": 0.898207426071167, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48381251096725464, "step": 1569 }, { "completion_length": 126.5, "epoch": 0.8400214018191546, "grad_norm": 0.4691658914089203, "kl": 0.14050054550170898, "learning_rate": 4.522725445072125e-06, "loss": 0.0056, "reward": 1.9217500686645508, "reward_std": 0.5796011686325073, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 1570 }, { "completion_length": 132.375, "epoch": 0.8405564472980204, "grad_norm": 4.54967737197876, "kl": 0.1893821656703949, "learning_rate": 4.521810335836222e-06, "loss": 0.0076, "reward": 1.586031198501587, "reward_std": 0.8793193101882935, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4454062581062317, "step": 1571 }, { "completion_length": 131.71875, "epoch": 0.841091492776886, "grad_norm": 0.9069510102272034, "kl": 0.1694534420967102, "learning_rate": 4.520894442907891e-06, "loss": 0.0068, "reward": 2.4790937900543213, "reward_std": 1.2438693046569824, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4478437304496765, "step": 1572 }, { "completion_length": 137.25, "epoch": 0.8416265382557517, "grad_norm": 0.8688150644302368, "kl": 0.20616109669208527, "learning_rate": 4.519977766642147e-06, "loss": 0.0082, "reward": 1.7962812185287476, "reward_std": 0.8358140587806702, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48378124833106995, "step": 1573 }, { "completion_length": 137.96875, "epoch": 0.8421615837346175, "grad_norm": 0.781317949295044, "kl": 0.18131250143051147, "learning_rate": 4.519060307394313e-06, "loss": 0.0073, "reward": 1.9829375743865967, "reward_std": 0.9426090717315674, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4673125147819519, "step": 1574 }, { "completion_length": 126.625, "epoch": 0.8426966292134831, "grad_norm": 4.213822364807129, "kl": 0.6588190793991089, "learning_rate": 4.5181420655200134e-06, "loss": 0.0264, "reward": 1.560906171798706, "reward_std": 0.8582451343536377, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4359062612056732, "step": 1575 }, { "completion_length": 137.25, "epoch": 0.8432316746923488, "grad_norm": 2.3191256523132324, "kl": 0.2938406467437744, "learning_rate": 4.517223041375176e-06, "loss": 0.0118, "reward": 2.2002811431884766, "reward_std": 0.9246636629104614, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4815312623977661, "step": 1576 }, { "completion_length": 132.75, "epoch": 0.8437667201712146, "grad_norm": 2.8279404640197754, "kl": 0.24945828318595886, "learning_rate": 4.516303235316033e-06, "loss": 0.01, "reward": 1.596093773841858, "reward_std": 0.7895544171333313, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4554687440395355, "step": 1577 }, { "completion_length": 106.96875, "epoch": 0.8443017656500803, "grad_norm": 0.6079077124595642, "kl": 0.1776258796453476, "learning_rate": 4.515382647699118e-06, "loss": 0.0071, "reward": 1.8826875686645508, "reward_std": 0.9313815832138062, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4608124792575836, "step": 1578 }, { "completion_length": 146.9375, "epoch": 0.8448368111289459, "grad_norm": 1.21575927734375, "kl": 0.26953524351119995, "learning_rate": 4.514461278881269e-06, "loss": 0.0108, "reward": 1.6203436851501465, "reward_std": 0.8165022134780884, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46409374475479126, "step": 1579 }, { "completion_length": 131.90625, "epoch": 0.8453718566078117, "grad_norm": 1.4840642213821411, "kl": 0.21198169887065887, "learning_rate": 4.513539129219626e-06, "loss": 0.0085, "reward": 2.42578125, "reward_std": 0.9304143190383911, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 1580 }, { "completion_length": 137.3125, "epoch": 0.8459069020866774, "grad_norm": 4.062705993652344, "kl": 0.20423109829425812, "learning_rate": 4.5126161990716325e-06, "loss": 0.0082, "reward": 1.887312650680542, "reward_std": 0.97198486328125, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48106250166893005, "step": 1581 }, { "completion_length": 139.46875, "epoch": 0.846441947565543, "grad_norm": 4.411106109619141, "kl": 0.19424495100975037, "learning_rate": 4.511692488795032e-06, "loss": 0.0078, "reward": 1.6961562633514404, "reward_std": 0.723456084728241, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.44615626335144043, "step": 1582 }, { "completion_length": 117.875, "epoch": 0.8469769930444088, "grad_norm": 2.2695629596710205, "kl": 0.21983833611011505, "learning_rate": 4.5107679987478744e-06, "loss": 0.0088, "reward": 2.12681245803833, "reward_std": 0.6750486493110657, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43931248784065247, "step": 1583 }, { "completion_length": 128.6875, "epoch": 0.8475120385232745, "grad_norm": 1.5560461282730103, "kl": 0.25382211804389954, "learning_rate": 4.509842729288509e-06, "loss": 0.0102, "reward": 1.652250051498413, "reward_std": 1.1426594257354736, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4491249918937683, "step": 1584 }, { "completion_length": 138.4375, "epoch": 0.8480470840021401, "grad_norm": 1.2567218542099, "kl": 0.16257935762405396, "learning_rate": 4.508916680775587e-06, "loss": 0.0065, "reward": 1.9439375400543213, "reward_std": 1.1594561338424683, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4283124804496765, "step": 1585 }, { "completion_length": 125.46875, "epoch": 0.8485821294810059, "grad_norm": 1.3335717916488647, "kl": 0.28349384665489197, "learning_rate": 4.5079898535680646e-06, "loss": 0.0113, "reward": 1.28515625, "reward_std": 0.42218226194381714, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45703125, "step": 1586 }, { "completion_length": 119.375, "epoch": 0.8491171749598716, "grad_norm": 3.7052268981933594, "kl": 0.4168338179588318, "learning_rate": 4.507062248025196e-06, "loss": 0.0167, "reward": 1.402093768119812, "reward_std": 0.47015678882598877, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.464593768119812, "step": 1587 }, { "completion_length": 128.8125, "epoch": 0.8496522204387373, "grad_norm": 1.101745367050171, "kl": 0.17631152272224426, "learning_rate": 4.506133864506539e-06, "loss": 0.0071, "reward": 1.5028438568115234, "reward_std": 0.5647742748260498, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4403437376022339, "step": 1588 }, { "completion_length": 122.34375, "epoch": 0.850187265917603, "grad_norm": 0.8685482144355774, "kl": 0.19276806712150574, "learning_rate": 4.5052047033719545e-06, "loss": 0.0077, "reward": 2.275749921798706, "reward_std": 0.7334715723991394, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46324998140335083, "step": 1589 }, { "completion_length": 128.4375, "epoch": 0.8507223113964687, "grad_norm": 5.081531524658203, "kl": 0.16674917936325073, "learning_rate": 4.504274764981601e-06, "loss": 0.0067, "reward": 2.03125, "reward_std": 0.5542854070663452, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 1590 }, { "completion_length": 140.625, "epoch": 0.8512573568753344, "grad_norm": 0.812029242515564, "kl": 0.20932376384735107, "learning_rate": 4.5033440496959415e-06, "loss": 0.0084, "reward": 1.4598125219345093, "reward_std": 0.7565438747406006, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4754375219345093, "step": 1591 }, { "completion_length": 128.90625, "epoch": 0.8517924023542001, "grad_norm": 1.2224968671798706, "kl": 0.19714295864105225, "learning_rate": 4.50241255787574e-06, "loss": 0.0079, "reward": 1.6210312843322754, "reward_std": 0.8473622798919678, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4335312247276306, "step": 1592 }, { "completion_length": 125.4375, "epoch": 0.8523274478330658, "grad_norm": 2.0894992351531982, "kl": 0.3651277422904968, "learning_rate": 4.501480289882059e-06, "loss": 0.0146, "reward": 2.787374973297119, "reward_std": 0.8158339858055115, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47487500309944153, "step": 1593 }, { "completion_length": 120.96875, "epoch": 0.8528624933119315, "grad_norm": 7.138772010803223, "kl": 0.6602271795272827, "learning_rate": 4.5005472460762635e-06, "loss": 0.0264, "reward": 1.9403437376022339, "reward_std": 0.8920074701309204, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4403437674045563, "step": 1594 }, { "completion_length": 126.8125, "epoch": 0.8533975387907973, "grad_norm": 1.7880306243896484, "kl": 0.19519387185573578, "learning_rate": 4.4996134268200205e-06, "loss": 0.0078, "reward": 2.149437427520752, "reward_std": 0.9696841239929199, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46193748712539673, "step": 1595 }, { "completion_length": 153.59375, "epoch": 0.8539325842696629, "grad_norm": 0.7003223299980164, "kl": 0.14307165145874023, "learning_rate": 4.498678832475295e-06, "loss": 0.0057, "reward": 1.5835312604904175, "reward_std": 0.9251044988632202, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4272812604904175, "step": 1596 }, { "completion_length": 124.46875, "epoch": 0.8544676297485286, "grad_norm": 3.636098623275757, "kl": 0.20651990175247192, "learning_rate": 4.497743463404354e-06, "loss": 0.0083, "reward": 1.8800938129425049, "reward_std": 0.7678779363632202, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4582187533378601, "step": 1597 }, { "completion_length": 142.71875, "epoch": 0.8550026752273944, "grad_norm": 2.750967502593994, "kl": 0.16941231489181519, "learning_rate": 4.4968073199697645e-06, "loss": 0.0068, "reward": 1.5115938186645508, "reward_std": 0.659896969795227, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.464718759059906, "step": 1598 }, { "completion_length": 136.09375, "epoch": 0.85553772070626, "grad_norm": 317.6622619628906, "kl": 3.518488645553589, "learning_rate": 4.495870402534394e-06, "loss": 0.1407, "reward": 2.4552500247955322, "reward_std": 0.9724465608596802, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45524999499320984, "step": 1599 }, { "completion_length": 154.8125, "epoch": 0.8560727661851257, "grad_norm": 355.9539489746094, "kl": 14.969222068786621, "learning_rate": 4.494932711461411e-06, "loss": 0.5988, "reward": 1.3837499618530273, "reward_std": 0.858312726020813, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43062499165534973, "step": 1600 }, { "completion_length": 135.65625, "epoch": 0.8566078116639915, "grad_norm": 2.3664438724517822, "kl": 0.25544795393943787, "learning_rate": 4.493994247114278e-06, "loss": 0.0102, "reward": 1.9765000343322754, "reward_std": 1.1451383829116821, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.460875004529953, "step": 1601 }, { "completion_length": 146.96875, "epoch": 0.8571428571428571, "grad_norm": 754478.0625, "kl": 2508.67724609375, "learning_rate": 4.493055009856767e-06, "loss": 100.3471, "reward": 1.1308125257492065, "reward_std": 0.2873239517211914, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45893749594688416, "step": 1602 }, { "completion_length": 118.03125, "epoch": 0.8576779026217228, "grad_norm": 1.0668048858642578, "kl": 0.21052348613739014, "learning_rate": 4.492115000052941e-06, "loss": 0.0084, "reward": 2.104281187057495, "reward_std": 0.6345750689506531, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4636562466621399, "step": 1603 }, { "completion_length": 129.75, "epoch": 0.8582129481005886, "grad_norm": 0.8926990628242493, "kl": 0.2670402526855469, "learning_rate": 4.491174218067167e-06, "loss": 0.0107, "reward": 1.9920001029968262, "reward_std": 0.6627544164657593, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4607499837875366, "step": 1604 }, { "completion_length": 148.78125, "epoch": 0.8587479935794543, "grad_norm": 0.9301356077194214, "kl": 0.1394871324300766, "learning_rate": 4.49023266426411e-06, "loss": 0.0056, "reward": 1.4780625104904175, "reward_std": 0.6308099031448364, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.3999375104904175, "step": 1605 }, { "completion_length": 107.15625, "epoch": 0.8592830390583199, "grad_norm": 1.0562214851379395, "kl": 0.20272813737392426, "learning_rate": 4.489290339008734e-06, "loss": 0.0081, "reward": 1.796375036239624, "reward_std": 0.5479749441146851, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49950000643730164, "step": 1606 }, { "completion_length": 134.09375, "epoch": 0.8598180845371857, "grad_norm": 1.7956931591033936, "kl": 0.16631817817687988, "learning_rate": 4.488347242666303e-06, "loss": 0.0067, "reward": 1.8708125352859497, "reward_std": 0.8743342757225037, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4333125054836273, "step": 1607 }, { "completion_length": 119.28125, "epoch": 0.8603531300160514, "grad_norm": 1.179260492324829, "kl": 0.19763755798339844, "learning_rate": 4.487403375602378e-06, "loss": 0.0079, "reward": 2.557374954223633, "reward_std": 0.7176085114479065, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4948750138282776, "step": 1608 }, { "completion_length": 142.875, "epoch": 0.860888175494917, "grad_norm": 14.461960792541504, "kl": 3.255930185317993, "learning_rate": 4.486458738182822e-06, "loss": 0.1302, "reward": 1.8559376001358032, "reward_std": 0.8386056423187256, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41843751072883606, "step": 1609 }, { "completion_length": 135.75, "epoch": 0.8614232209737828, "grad_norm": 0.8220904469490051, "kl": 0.19795089960098267, "learning_rate": 4.485513330773793e-06, "loss": 0.0079, "reward": 2.238687515258789, "reward_std": 0.9905767440795898, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47306251525878906, "step": 1610 }, { "completion_length": 119.40625, "epoch": 0.8619582664526485, "grad_norm": 1.9033467769622803, "kl": 0.1531013548374176, "learning_rate": 4.484567153741749e-06, "loss": 0.0061, "reward": 1.9555938243865967, "reward_std": 0.6557810306549072, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4555937349796295, "step": 1611 }, { "completion_length": 102.6875, "epoch": 0.8624933119315141, "grad_norm": 1.1074745655059814, "kl": 0.23710186779499054, "learning_rate": 4.483620207453446e-06, "loss": 0.0095, "reward": 2.216750144958496, "reward_std": 0.5810060501098633, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49800002574920654, "step": 1612 }, { "completion_length": 155.6875, "epoch": 0.8630283574103799, "grad_norm": 2.074263334274292, "kl": 0.16174696385860443, "learning_rate": 4.4826724922759405e-06, "loss": 0.0065, "reward": 1.3968437910079956, "reward_std": 0.8380612134933472, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.39684373140335083, "step": 1613 }, { "completion_length": 121.90625, "epoch": 0.8635634028892456, "grad_norm": 1.1296294927597046, "kl": 0.2911314368247986, "learning_rate": 4.4817240085765834e-06, "loss": 0.0116, "reward": 1.8610000610351562, "reward_std": 0.9289439916610718, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4703750014305115, "step": 1614 }, { "completion_length": 131.03125, "epoch": 0.8640984483681113, "grad_norm": 0.8063473701477051, "kl": 0.13420771062374115, "learning_rate": 4.4807747567230255e-06, "loss": 0.0054, "reward": 1.9412500858306885, "reward_std": 0.8432976007461548, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4100000262260437, "step": 1615 }, { "completion_length": 145.90625, "epoch": 0.864633493846977, "grad_norm": 1.2441762685775757, "kl": 0.15485309064388275, "learning_rate": 4.479824737083215e-06, "loss": 0.0062, "reward": 1.8420000076293945, "reward_std": 0.8475812077522278, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.42012500762939453, "step": 1616 }, { "completion_length": 124.75, "epoch": 0.8651685393258427, "grad_norm": 2.035675048828125, "kl": 0.15088392794132233, "learning_rate": 4.4788739500253985e-06, "loss": 0.006, "reward": 2.065624952316284, "reward_std": 0.49568629264831543, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.42499998211860657, "step": 1617 }, { "completion_length": 125.09375, "epoch": 0.8657035848047084, "grad_norm": 3.075867176055908, "kl": 0.23314465582370758, "learning_rate": 4.477922395918118e-06, "loss": 0.0093, "reward": 1.8176562786102295, "reward_std": 0.8750048279762268, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4582812488079071, "step": 1618 }, { "completion_length": 139.46875, "epoch": 0.8662386302835741, "grad_norm": 0.8245086073875427, "kl": 0.15524554252624512, "learning_rate": 4.476970075130215e-06, "loss": 0.0062, "reward": 1.5368125438690186, "reward_std": 0.7148545384407043, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44306251406669617, "step": 1619 }, { "completion_length": 134.1875, "epoch": 0.8667736757624398, "grad_norm": 0.764607846736908, "kl": 0.1796361356973648, "learning_rate": 4.4760169880308254e-06, "loss": 0.0072, "reward": 2.4475936889648438, "reward_std": 0.729092001914978, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4944687485694885, "step": 1620 }, { "completion_length": 140.4375, "epoch": 0.8673087212413055, "grad_norm": 0.9607394933700562, "kl": 0.14931811392307281, "learning_rate": 4.475063134989387e-06, "loss": 0.006, "reward": 1.4091250896453857, "reward_std": 0.7312867045402527, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44037503004074097, "step": 1621 }, { "completion_length": 145.40625, "epoch": 0.8678437667201712, "grad_norm": 1.1943235397338867, "kl": 0.2154906988143921, "learning_rate": 4.47410851637563e-06, "loss": 0.0086, "reward": 2.061812400817871, "reward_std": 1.0318825244903564, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.45243752002716064, "step": 1622 }, { "completion_length": 139.625, "epoch": 0.8683788121990369, "grad_norm": 2.866452932357788, "kl": 0.19678041338920593, "learning_rate": 4.4731531325595835e-06, "loss": 0.0079, "reward": 2.0078125, "reward_std": 0.448444128036499, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 1623 }, { "completion_length": 122.46875, "epoch": 0.8689138576779026, "grad_norm": 1.302825927734375, "kl": 0.1563129425048828, "learning_rate": 4.472196983911571e-06, "loss": 0.0063, "reward": 1.88671875, "reward_std": 0.7917191982269287, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.49609375, "step": 1624 }, { "completion_length": 127.53125, "epoch": 0.8694489031567684, "grad_norm": 0.7758172750473022, "kl": 0.1594730168581009, "learning_rate": 4.471240070802216e-06, "loss": 0.0064, "reward": 2.235593795776367, "reward_std": 0.8454384207725525, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4074687361717224, "step": 1625 }, { "completion_length": 151.34375, "epoch": 0.869983948635634, "grad_norm": 0.6950615644454956, "kl": 0.13640782237052917, "learning_rate": 4.470282393602435e-06, "loss": 0.0055, "reward": 1.3220312595367432, "reward_std": 0.8435408473014832, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41578125953674316, "step": 1626 }, { "completion_length": 122.6875, "epoch": 0.8705189941144997, "grad_norm": 1.0807777643203735, "kl": 0.1855405569076538, "learning_rate": 4.469323952683442e-06, "loss": 0.0074, "reward": 2.020250082015991, "reward_std": 0.7909431457519531, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47337499260902405, "step": 1627 }, { "completion_length": 138.75, "epoch": 0.8710540395933655, "grad_norm": 1.059369444847107, "kl": 0.16682085394859314, "learning_rate": 4.468364748416748e-06, "loss": 0.0067, "reward": 1.3319687843322754, "reward_std": 0.6702696084976196, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.441343754529953, "step": 1628 }, { "completion_length": 143.59375, "epoch": 0.8715890850722311, "grad_norm": 1.2135541439056396, "kl": 0.15200483798980713, "learning_rate": 4.4674047811741585e-06, "loss": 0.0061, "reward": 1.7224375009536743, "reward_std": 0.47491419315338135, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4411875009536743, "step": 1629 }, { "completion_length": 169.5625, "epoch": 0.8721241305510968, "grad_norm": 1.5808266401290894, "kl": 0.10383696854114532, "learning_rate": 4.4664440513277754e-06, "loss": 0.0042, "reward": 2.184093713760376, "reward_std": 1.0984234809875488, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41846874356269836, "step": 1630 }, { "completion_length": 126.9375, "epoch": 0.8726591760299626, "grad_norm": 0.9151985049247742, "kl": 0.1990092396736145, "learning_rate": 4.465482559249996e-06, "loss": 0.008, "reward": 1.495093822479248, "reward_std": 0.5118600130081177, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4169687330722809, "step": 1631 }, { "completion_length": 141.0, "epoch": 0.8731942215088283, "grad_norm": 0.6949145793914795, "kl": 0.1245662197470665, "learning_rate": 4.464520305313512e-06, "loss": 0.005, "reward": 1.786468744277954, "reward_std": 0.9689445495605469, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4583437442779541, "step": 1632 }, { "completion_length": 144.25, "epoch": 0.8737292669876939, "grad_norm": 0.5527955293655396, "kl": 0.13218635320663452, "learning_rate": 4.463557289891313e-06, "loss": 0.0053, "reward": 2.2295937538146973, "reward_std": 0.5754712820053101, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47959375381469727, "step": 1633 }, { "completion_length": 127.65625, "epoch": 0.8742643124665597, "grad_norm": 2.733309745788574, "kl": 0.1943182796239853, "learning_rate": 4.462593513356681e-06, "loss": 0.0078, "reward": 1.8647187948226929, "reward_std": 0.7834404706954956, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4584687352180481, "step": 1634 }, { "completion_length": 129.90625, "epoch": 0.8747993579454254, "grad_norm": 0.5333341956138611, "kl": 0.1985965371131897, "learning_rate": 4.4616289760831954e-06, "loss": 0.0079, "reward": 2.25390625, "reward_std": 0.3258049190044403, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1635 }, { "completion_length": 118.6875, "epoch": 0.875334403424291, "grad_norm": 1.4644341468811035, "kl": 0.22032210230827332, "learning_rate": 4.460663678444729e-06, "loss": 0.0088, "reward": 2.2011876106262207, "reward_std": 0.9514398574829102, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45118749141693115, "step": 1636 }, { "completion_length": 111.53125, "epoch": 0.8758694489031568, "grad_norm": 0.8199777603149414, "kl": 0.20850664377212524, "learning_rate": 4.459697620815449e-06, "loss": 0.0083, "reward": 1.5, "reward_std": 0.6647130846977234, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 1637 }, { "completion_length": 150.625, "epoch": 0.8764044943820225, "grad_norm": 3.5995054244995117, "kl": 0.5133098363876343, "learning_rate": 4.458730803569817e-06, "loss": 0.0205, "reward": 1.4608125686645508, "reward_std": 0.4159954786300659, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.429562509059906, "step": 1638 }, { "completion_length": 133.71875, "epoch": 0.8769395398608881, "grad_norm": 1036.9486083984375, "kl": 24.581010818481445, "learning_rate": 4.457763227082593e-06, "loss": 0.9832, "reward": 1.8350000381469727, "reward_std": 1.1654709577560425, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46000000834465027, "step": 1639 }, { "completion_length": 145.1875, "epoch": 0.8774745853397539, "grad_norm": 2.086592197418213, "kl": 0.22343601286411285, "learning_rate": 4.456794891728825e-06, "loss": 0.0089, "reward": 1.1638751029968262, "reward_std": 0.7721163034439087, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.382625013589859, "step": 1640 }, { "completion_length": 129.15625, "epoch": 0.8780096308186196, "grad_norm": 1.0864077806472778, "kl": 0.205488920211792, "learning_rate": 4.45582579788386e-06, "loss": 0.0082, "reward": 1.94140625, "reward_std": 0.7830632925033569, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1641 }, { "completion_length": 158.125, "epoch": 0.8785446762974853, "grad_norm": 0.6358566284179688, "kl": 0.22841942310333252, "learning_rate": 4.454855945923338e-06, "loss": 0.0091, "reward": 1.6073436737060547, "reward_std": 0.9609177112579346, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.38859373331069946, "step": 1642 }, { "completion_length": 152.0625, "epoch": 0.879079721776351, "grad_norm": 1.575571060180664, "kl": 0.27097275853157043, "learning_rate": 4.453885336223191e-06, "loss": 0.0108, "reward": 1.8204686641693115, "reward_std": 1.0626567602157593, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4298437535762787, "step": 1643 }, { "completion_length": 116.9375, "epoch": 0.8796147672552167, "grad_norm": 165910560768.0, "kl": 417928160.0, "learning_rate": 4.452913969159645e-06, "loss": 16717126.0, "reward": 2.1219687461853027, "reward_std": 0.73097163438797, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46571874618530273, "step": 1644 }, { "completion_length": 127.71875, "epoch": 0.8801498127340824, "grad_norm": 2.4678211212158203, "kl": 0.1871940791606903, "learning_rate": 4.451941845109222e-06, "loss": 0.0075, "reward": 2.0081875324249268, "reward_std": 0.9730182886123657, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.46131250262260437, "step": 1645 }, { "completion_length": 145.9375, "epoch": 0.8806848582129481, "grad_norm": 0.5313578844070435, "kl": 0.15910206735134125, "learning_rate": 4.450968964448735e-06, "loss": 0.0064, "reward": 1.9456875324249268, "reward_std": 0.6384711861610413, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43006250262260437, "step": 1646 }, { "completion_length": 119.5, "epoch": 0.8812199036918138, "grad_norm": 2.1732001304626465, "kl": 0.41467443108558655, "learning_rate": 4.449995327555293e-06, "loss": 0.0166, "reward": 2.210750102996826, "reward_std": 0.7924202680587769, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.492000013589859, "step": 1647 }, { "completion_length": 110.375, "epoch": 0.8817549491706795, "grad_norm": 0.8898391723632812, "kl": 0.18398860096931458, "learning_rate": 4.449020934806294e-06, "loss": 0.0074, "reward": 2.398031234741211, "reward_std": 0.8391987085342407, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47615623474121094, "step": 1648 }, { "completion_length": 117.25, "epoch": 0.8822899946495452, "grad_norm": 1.0775254964828491, "kl": 0.2766188979148865, "learning_rate": 4.4480457865794335e-06, "loss": 0.0111, "reward": 1.497093677520752, "reward_std": 0.6627904772758484, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.46584373712539673, "step": 1649 }, { "completion_length": 128.84375, "epoch": 0.8828250401284109, "grad_norm": 1.2636778354644775, "kl": 0.2675359845161438, "learning_rate": 4.447069883252696e-06, "loss": 0.0107, "reward": 1.6874375343322754, "reward_std": 1.022395133972168, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.406187504529953, "step": 1650 }, { "completion_length": 116.28125, "epoch": 0.8833600856072766, "grad_norm": 6.3341450691223145, "kl": 0.914478063583374, "learning_rate": 4.44609322520436e-06, "loss": 0.0366, "reward": 1.5175937414169312, "reward_std": 0.7608456611633301, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.42384377121925354, "step": 1651 }, { "completion_length": 122.25, "epoch": 0.8838951310861424, "grad_norm": 1.6191951036453247, "kl": 0.31989091634750366, "learning_rate": 4.4451158128129985e-06, "loss": 0.0128, "reward": 2.432406425476074, "reward_std": 1.1396088600158691, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4636562466621399, "step": 1652 }, { "completion_length": 138.21875, "epoch": 0.884430176565008, "grad_norm": 1.2905811071395874, "kl": 0.15274667739868164, "learning_rate": 4.444137646457474e-06, "loss": 0.0061, "reward": 1.7282187938690186, "reward_std": 1.0565094947814941, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, "rewards/xmlcount_reward_func": 0.3532187342643738, "step": 1653 }, { "completion_length": 120.90625, "epoch": 0.8849652220438737, "grad_norm": 4.920258045196533, "kl": 0.3349223732948303, "learning_rate": 4.443158726516943e-06, "loss": 0.0134, "reward": 2.6035313606262207, "reward_std": 0.9109714031219482, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43165624141693115, "step": 1654 }, { "completion_length": 135.71875, "epoch": 0.8855002675227395, "grad_norm": 1.862505316734314, "kl": 0.12341786921024323, "learning_rate": 4.4421790533708545e-06, "loss": 0.0049, "reward": 1.355062484741211, "reward_std": 0.8290364146232605, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.32381248474121094, "step": 1655 }, { "completion_length": 110.53125, "epoch": 0.8860353130016051, "grad_norm": 1.5674195289611816, "kl": 0.22027906775474548, "learning_rate": 4.441198627398947e-06, "loss": 0.0088, "reward": 2.007406234741211, "reward_std": 0.9080035090446472, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.41365623474121094, "step": 1656 }, { "completion_length": 152.75, "epoch": 0.8865703584804708, "grad_norm": 1.0081952810287476, "kl": 0.1531856805086136, "learning_rate": 4.440217448981252e-06, "loss": 0.0061, "reward": 0.9732186794281006, "reward_std": 0.6765741109848022, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.171875, "rewards/xmlcount_reward_func": 0.27009373903274536, "step": 1657 }, { "completion_length": 145.84375, "epoch": 0.8871054039593366, "grad_norm": 1.2894848585128784, "kl": 0.1599048376083374, "learning_rate": 4.439235518498096e-06, "loss": 0.0064, "reward": 0.9443750381469727, "reward_std": 0.4541562795639038, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.30375000834465027, "step": 1658 }, { "completion_length": 130.75, "epoch": 0.8876404494382022, "grad_norm": 0.8025126457214355, "kl": 0.14680489897727966, "learning_rate": 4.438252836330091e-06, "loss": 0.0059, "reward": 1.2326562404632568, "reward_std": 0.595244288444519, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1875, "rewards/xmlcount_reward_func": 0.3264062702655792, "step": 1659 }, { "completion_length": 145.4375, "epoch": 0.8881754949170679, "grad_norm": 0.9279918074607849, "kl": 0.14886754751205444, "learning_rate": 4.437269402858146e-06, "loss": 0.006, "reward": 1.457437515258789, "reward_std": 0.7444842457771301, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3793124854564667, "step": 1660 }, { "completion_length": 117.90625, "epoch": 0.8887105403959337, "grad_norm": 3.1375856399536133, "kl": 0.27639633417129517, "learning_rate": 4.4362852184634555e-06, "loss": 0.0111, "reward": 1.7401562929153442, "reward_std": 0.8352799415588379, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41203123331069946, "step": 1661 }, { "completion_length": 129.28125, "epoch": 0.8892455858747994, "grad_norm": 6.355554103851318, "kl": 0.15243421494960785, "learning_rate": 4.435300283527511e-06, "loss": 0.0061, "reward": 1.362781286239624, "reward_std": 0.5795766711235046, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.39403125643730164, "step": 1662 }, { "completion_length": 146.71875, "epoch": 0.889780631353665, "grad_norm": 1.4725793600082397, "kl": 0.16165515780448914, "learning_rate": 4.434314598432091e-06, "loss": 0.0065, "reward": 1.2790000438690186, "reward_std": 0.6478156447410583, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, "rewards/xmlcount_reward_func": 0.31025001406669617, "step": 1663 }, { "completion_length": 109.0625, "epoch": 0.8903156768325308, "grad_norm": 0.9876142144203186, "kl": 0.16145920753479004, "learning_rate": 4.433328163559266e-06, "loss": 0.0065, "reward": 2.3441874980926514, "reward_std": 0.6430984735488892, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46918749809265137, "step": 1664 }, { "completion_length": 131.125, "epoch": 0.8908507223113965, "grad_norm": 0.9460629820823669, "kl": 0.14071545004844666, "learning_rate": 4.4323409792913975e-06, "loss": 0.0056, "reward": 1.712843656539917, "reward_std": 0.771905779838562, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44721874594688416, "step": 1665 }, { "completion_length": 141.78125, "epoch": 0.8913857677902621, "grad_norm": 1.1323693990707397, "kl": 0.15308818221092224, "learning_rate": 4.4313530460111365e-06, "loss": 0.0061, "reward": 1.5511562824249268, "reward_std": 0.7471864223480225, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.37928125262260437, "step": 1666 }, { "completion_length": 147.625, "epoch": 0.8919208132691279, "grad_norm": 1.177818775177002, "kl": 0.2067728340625763, "learning_rate": 4.430364364101425e-06, "loss": 0.0083, "reward": 1.4640625715255737, "reward_std": 0.526314914226532, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.32343751192092896, "step": 1667 }, { "completion_length": 115.9375, "epoch": 0.8924558587479936, "grad_norm": 3.3931407928466797, "kl": 0.23521198332309723, "learning_rate": 4.429374933945496e-06, "loss": 0.0094, "reward": 2.2044687271118164, "reward_std": 0.6133701801300049, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4700937569141388, "step": 1668 }, { "completion_length": 130.65625, "epoch": 0.8929909042268592, "grad_norm": 9.472710609436035, "kl": 0.38005828857421875, "learning_rate": 4.42838475592687e-06, "loss": 0.0152, "reward": 1.843093752861023, "reward_std": 0.9584698677062988, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40559375286102295, "step": 1669 }, { "completion_length": 137.84375, "epoch": 0.893525949705725, "grad_norm": 0.6716368794441223, "kl": 0.14956489205360413, "learning_rate": 4.427393830429359e-06, "loss": 0.006, "reward": 2.1290626525878906, "reward_std": 0.572338342666626, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4415625035762787, "step": 1670 }, { "completion_length": 151.15625, "epoch": 0.8940609951845907, "grad_norm": 0.9410247206687927, "kl": 0.21000060439109802, "learning_rate": 4.426402157837067e-06, "loss": 0.0084, "reward": 1.6252501010894775, "reward_std": 0.7476182579994202, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.4065000116825104, "step": 1671 }, { "completion_length": 141.15625, "epoch": 0.8945960406634564, "grad_norm": 1.5046073198318481, "kl": 0.2577880620956421, "learning_rate": 4.425409738534383e-06, "loss": 0.0103, "reward": 1.5719687938690186, "reward_std": 0.598316490650177, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, "rewards/xmlcount_reward_func": 0.3532187342643738, "step": 1672 }, { "completion_length": 133.96875, "epoch": 0.8951310861423221, "grad_norm": 1.287421703338623, "kl": 0.14767375588417053, "learning_rate": 4.424416572905988e-06, "loss": 0.0059, "reward": 1.5770937204360962, "reward_std": 0.9449362754821777, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.42084378004074097, "step": 1673 }, { "completion_length": 146.71875, "epoch": 0.8956661316211878, "grad_norm": 1.4863699674606323, "kl": 0.18808318674564362, "learning_rate": 4.423422661336854e-06, "loss": 0.0075, "reward": 2.2000937461853027, "reward_std": 1.1209099292755127, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41884374618530273, "step": 1674 }, { "completion_length": 145.25, "epoch": 0.8962011771000535, "grad_norm": 1.4523019790649414, "kl": 0.18740049004554749, "learning_rate": 4.4224280042122365e-06, "loss": 0.0075, "reward": 1.8289062976837158, "reward_std": 1.0367772579193115, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43828123807907104, "step": 1675 }, { "completion_length": 139.1875, "epoch": 0.8967362225789192, "grad_norm": 1.1988871097564697, "kl": 0.15338021516799927, "learning_rate": 4.421432601917687e-06, "loss": 0.0061, "reward": 1.9665625095367432, "reward_std": 0.6047869920730591, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.35718750953674316, "step": 1676 }, { "completion_length": 123.28125, "epoch": 0.8972712680577849, "grad_norm": 1.0298402309417725, "kl": 0.18523354828357697, "learning_rate": 4.420436454839041e-06, "loss": 0.0074, "reward": 1.5404062271118164, "reward_std": 0.6703990697860718, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4310312271118164, "step": 1677 }, { "completion_length": 138.3125, "epoch": 0.8978063135366506, "grad_norm": 1.2769932746887207, "kl": 0.19586506485939026, "learning_rate": 4.419439563362425e-06, "loss": 0.0078, "reward": 2.4310624599456787, "reward_std": 1.1541640758514404, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4310625195503235, "step": 1678 }, { "completion_length": 152.625, "epoch": 0.8983413590155164, "grad_norm": 0.9081660509109497, "kl": 0.21381047368049622, "learning_rate": 4.418441927874253e-06, "loss": 0.0086, "reward": 1.1810312271118164, "reward_std": 0.5392601490020752, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.3685312271118164, "step": 1679 }, { "completion_length": 155.09375, "epoch": 0.898876404494382, "grad_norm": 0.7124467492103577, "kl": 0.14474600553512573, "learning_rate": 4.417443548761226e-06, "loss": 0.0058, "reward": 1.2305312156677246, "reward_std": 0.72370445728302, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.371156245470047, "step": 1680 }, { "completion_length": 133.46875, "epoch": 0.8994114499732477, "grad_norm": 0.7923385500907898, "kl": 0.16018570959568024, "learning_rate": 4.416444426410338e-06, "loss": 0.0064, "reward": 1.723406195640564, "reward_std": 0.7760542631149292, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.45778125524520874, "step": 1681 }, { "completion_length": 119.9375, "epoch": 0.8999464954521135, "grad_norm": 6.822381973266602, "kl": 0.23563653230667114, "learning_rate": 4.415444561208865e-06, "loss": 0.0094, "reward": 2.447499990463257, "reward_std": 0.6706820726394653, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43187499046325684, "step": 1682 }, { "completion_length": 117.5, "epoch": 0.9004815409309791, "grad_norm": 0.806988537311554, "kl": 0.15441694855690002, "learning_rate": 4.414443953544376e-06, "loss": 0.0062, "reward": 2.6328749656677246, "reward_std": 0.7928217053413391, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.476624995470047, "step": 1683 }, { "completion_length": 137.875, "epoch": 0.9010165864098448, "grad_norm": 1.3266806602478027, "kl": 0.2003033310174942, "learning_rate": 4.413442603804724e-06, "loss": 0.008, "reward": 2.3003125190734863, "reward_std": 0.7599604725837708, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45656251907348633, "step": 1684 }, { "completion_length": 122.53125, "epoch": 0.9015516318887106, "grad_norm": 1.5371248722076416, "kl": 0.1670612245798111, "learning_rate": 4.412440512378053e-06, "loss": 0.0067, "reward": 1.6608750820159912, "reward_std": 0.7529750466346741, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44212496280670166, "step": 1685 }, { "completion_length": 127.75, "epoch": 0.9020866773675762, "grad_norm": 0.8580349683761597, "kl": 0.15195290744304657, "learning_rate": 4.411437679652791e-06, "loss": 0.0061, "reward": 1.5271250009536743, "reward_std": 0.7939974069595337, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4333750009536743, "step": 1686 }, { "completion_length": 131.6875, "epoch": 0.9026217228464419, "grad_norm": 0.8339689373970032, "kl": 0.30425918102264404, "learning_rate": 4.410434106017657e-06, "loss": 0.0122, "reward": 1.7636876106262207, "reward_std": 0.7354918718338013, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45118749141693115, "step": 1687 }, { "completion_length": 129.6875, "epoch": 0.9031567683253077, "grad_norm": 1.8566538095474243, "kl": 0.2805631160736084, "learning_rate": 4.409429791861654e-06, "loss": 0.0112, "reward": 1.7979999780654907, "reward_std": 0.9275040626525879, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4542499780654907, "step": 1688 }, { "completion_length": 136.75, "epoch": 0.9036918138041734, "grad_norm": 0.6122969388961792, "kl": 0.13348375260829926, "learning_rate": 4.408424737574075e-06, "loss": 0.0053, "reward": 1.9511874914169312, "reward_std": 0.5461631417274475, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46681249141693115, "step": 1689 }, { "completion_length": 119.875, "epoch": 0.904226859283039, "grad_norm": 0.9575679302215576, "kl": 0.20273657143115997, "learning_rate": 4.407418943544497e-06, "loss": 0.0081, "reward": 2.3134686946868896, "reward_std": 0.830177903175354, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4853437542915344, "step": 1690 }, { "completion_length": 146.90625, "epoch": 0.9047619047619048, "grad_norm": 1.72266685962677, "kl": 0.16740216314792633, "learning_rate": 4.406412410162786e-06, "loss": 0.0067, "reward": 1.6820311546325684, "reward_std": 0.933803141117096, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3695312440395355, "step": 1691 }, { "completion_length": 122.28125, "epoch": 0.9052969502407705, "grad_norm": 0.765997588634491, "kl": 0.20881308615207672, "learning_rate": 4.4054051378190915e-06, "loss": 0.0084, "reward": 2.1525938510894775, "reward_std": 0.9468134641647339, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.480718731880188, "step": 1692 }, { "completion_length": 130.6875, "epoch": 0.9058319957196361, "grad_norm": 0.8315994739532471, "kl": 0.16093501448631287, "learning_rate": 4.404397126903854e-06, "loss": 0.0064, "reward": 2.0016562938690186, "reward_std": 0.759629487991333, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.40790626406669617, "step": 1693 }, { "completion_length": 142.46875, "epoch": 0.9063670411985019, "grad_norm": 2.0057873725891113, "kl": 0.1954854130744934, "learning_rate": 4.403388377807796e-06, "loss": 0.0078, "reward": 1.8427499532699585, "reward_std": 0.8326542973518372, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.42087501287460327, "step": 1694 }, { "completion_length": 132.34375, "epoch": 0.9069020866773676, "grad_norm": 3.4317116737365723, "kl": 0.16497763991355896, "learning_rate": 4.40237889092193e-06, "loss": 0.0066, "reward": 2.2812187671661377, "reward_std": 0.7806167006492615, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4062187671661377, "step": 1695 }, { "completion_length": 141.1875, "epoch": 0.9074371321562332, "grad_norm": 0.7727159857749939, "kl": 0.1442790925502777, "learning_rate": 4.40136866663755e-06, "loss": 0.0058, "reward": 1.6745312213897705, "reward_std": 0.7156598567962646, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4245312213897705, "step": 1696 }, { "completion_length": 145.6875, "epoch": 0.907972177635099, "grad_norm": 0.6829155683517456, "kl": 0.11573144793510437, "learning_rate": 4.400357705346241e-06, "loss": 0.0046, "reward": 1.6414687633514404, "reward_std": 0.6682366132736206, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48521876335144043, "step": 1697 }, { "completion_length": 130.34375, "epoch": 0.9085072231139647, "grad_norm": 0.6340619921684265, "kl": 0.17223501205444336, "learning_rate": 4.3993460074398685e-06, "loss": 0.0069, "reward": 1.239593744277954, "reward_std": 0.4736405611038208, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4583437442779541, "step": 1698 }, { "completion_length": 126.65625, "epoch": 0.9090422685928304, "grad_norm": 2.4984347820281982, "kl": 0.1789015531539917, "learning_rate": 4.398333573310588e-06, "loss": 0.0072, "reward": 2.1010937690734863, "reward_std": 0.7758955955505371, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44484376907348633, "step": 1699 }, { "completion_length": 151.71875, "epoch": 0.9095773140716961, "grad_norm": 1.3374298810958862, "kl": 0.13953164219856262, "learning_rate": 4.397320403350837e-06, "loss": 0.0056, "reward": 1.3265312910079956, "reward_std": 0.6879876852035522, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3734062612056732, "step": 1700 }, { "completion_length": 120.15625, "epoch": 0.9101123595505618, "grad_norm": 3.7366421222686768, "kl": 0.19912400841712952, "learning_rate": 4.39630649795334e-06, "loss": 0.008, "reward": 1.603968858718872, "reward_std": 0.467069536447525, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4164687395095825, "step": 1701 }, { "completion_length": 130.4375, "epoch": 0.9106474050294275, "grad_norm": 5.703749179840088, "kl": 0.20578230917453766, "learning_rate": 4.395291857511106e-06, "loss": 0.0082, "reward": 1.527999997138977, "reward_std": 0.6046889424324036, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41862499713897705, "step": 1702 }, { "completion_length": 120.28125, "epoch": 0.9111824505082932, "grad_norm": 0.4289214611053467, "kl": 0.20747539401054382, "learning_rate": 4.39427648241743e-06, "loss": 0.0083, "reward": 1.7032811641693115, "reward_std": 0.7370811104774475, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4689062535762787, "step": 1703 }, { "completion_length": 113.03125, "epoch": 0.9117174959871589, "grad_norm": 0.48398157954216003, "kl": 0.14451837539672852, "learning_rate": 4.39326037306589e-06, "loss": 0.0058, "reward": 2.55440616607666, "reward_std": 0.5679928064346313, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4762812554836273, "step": 1704 }, { "completion_length": 124.46875, "epoch": 0.9122525414660246, "grad_norm": 0.7913897633552551, "kl": 0.19113734364509583, "learning_rate": 4.392243529850348e-06, "loss": 0.0076, "reward": 2.405843734741211, "reward_std": 0.6460013389587402, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43709373474121094, "step": 1705 }, { "completion_length": 150.25, "epoch": 0.9127875869448903, "grad_norm": 1.6628612279891968, "kl": 0.1529589593410492, "learning_rate": 4.3912259531649554e-06, "loss": 0.0061, "reward": 1.0625, "reward_std": 0.6806312203407288, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.390625, "step": 1706 }, { "completion_length": 128.5625, "epoch": 0.913322632423756, "grad_norm": 2.1891591548919678, "kl": 0.22519955039024353, "learning_rate": 4.390207643404142e-06, "loss": 0.009, "reward": 1.5495624542236328, "reward_std": 0.7969678044319153, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4401875138282776, "step": 1707 }, { "completion_length": 128.21875, "epoch": 0.9138576779026217, "grad_norm": 1.1222797632217407, "kl": 0.24795064330101013, "learning_rate": 4.389188600962624e-06, "loss": 0.0099, "reward": 2.0060625076293945, "reward_std": 0.9654276371002197, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42793747782707214, "step": 1708 }, { "completion_length": 140.25, "epoch": 0.9143927233814875, "grad_norm": 0.7549324035644531, "kl": 0.1810382604598999, "learning_rate": 4.388168826235404e-06, "loss": 0.0072, "reward": 2.3544063568115234, "reward_std": 1.0045884847640991, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4481562376022339, "step": 1709 }, { "completion_length": 128.21875, "epoch": 0.9149277688603531, "grad_norm": 1.1794484853744507, "kl": 0.2135278880596161, "learning_rate": 4.3871483196177635e-06, "loss": 0.0085, "reward": 2.023937463760376, "reward_std": 0.9980809092521667, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.461437463760376, "step": 1710 }, { "completion_length": 125.28125, "epoch": 0.9154628143392188, "grad_norm": 5.79110050201416, "kl": 0.34046676754951477, "learning_rate": 4.386127081505272e-06, "loss": 0.0136, "reward": 2.1982812881469727, "reward_std": 0.8469822406768799, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44828125834465027, "step": 1711 }, { "completion_length": 116.1875, "epoch": 0.9159978598180846, "grad_norm": 1.1234228610992432, "kl": 0.21099819242954254, "learning_rate": 4.3851051122937795e-06, "loss": 0.0084, "reward": 2.0200624465942383, "reward_std": 0.6700142621994019, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45756250619888306, "step": 1712 }, { "completion_length": 160.59375, "epoch": 0.9165329052969502, "grad_norm": 508.83319091796875, "kl": 0.8622794151306152, "learning_rate": 4.3840824123794236e-06, "loss": 0.0345, "reward": 1.3712812662124634, "reward_std": 0.9424479603767395, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3712812662124634, "step": 1713 }, { "completion_length": 121.5625, "epoch": 0.9170679507758159, "grad_norm": 1.8715664148330688, "kl": 0.29560384154319763, "learning_rate": 4.38305898215862e-06, "loss": 0.0118, "reward": 1.8777186870574951, "reward_std": 0.9443146586418152, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4245937466621399, "step": 1714 }, { "completion_length": 123.5, "epoch": 0.9176029962546817, "grad_norm": 1.0819331407546997, "kl": 0.19826193153858185, "learning_rate": 4.382034822028071e-06, "loss": 0.0079, "reward": 2.5593748092651367, "reward_std": 0.9158610105514526, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.40312498807907104, "step": 1715 }, { "completion_length": 140.6875, "epoch": 0.9181380417335474, "grad_norm": 1.122309684753418, "kl": 0.16672393679618835, "learning_rate": 4.38100993238476e-06, "loss": 0.0067, "reward": 1.7745938301086426, "reward_std": 0.8364566564559937, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.399593710899353, "step": 1716 }, { "completion_length": 141.71875, "epoch": 0.918673087212413, "grad_norm": 1.0008540153503418, "kl": 0.17287901043891907, "learning_rate": 4.379984313625954e-06, "loss": 0.0069, "reward": 1.933437466621399, "reward_std": 1.0794782638549805, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.3865625262260437, "step": 1717 }, { "completion_length": 132.34375, "epoch": 0.9192081326912788, "grad_norm": 1.1275811195373535, "kl": 0.22574973106384277, "learning_rate": 4.3789579661492024e-06, "loss": 0.009, "reward": 1.6855937242507935, "reward_std": 1.113426923751831, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.38871872425079346, "step": 1718 }, { "completion_length": 121.1875, "epoch": 0.9197431781701445, "grad_norm": 0.8592840433120728, "kl": 0.19319051504135132, "learning_rate": 4.377930890352337e-06, "loss": 0.0077, "reward": 2.223875045776367, "reward_std": 0.8744800686836243, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4582499861717224, "step": 1719 }, { "completion_length": 132.78125, "epoch": 0.9202782236490101, "grad_norm": 2.202993392944336, "kl": 0.22677922248840332, "learning_rate": 4.376903086633473e-06, "loss": 0.0091, "reward": 1.9043126106262207, "reward_std": 0.8386082649230957, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41993752121925354, "step": 1720 }, { "completion_length": 128.09375, "epoch": 0.9208132691278759, "grad_norm": 0.7808912396430969, "kl": 0.15812084078788757, "learning_rate": 4.3758745553910065e-06, "loss": 0.0063, "reward": 1.3902812004089355, "reward_std": 0.8969213366508484, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3902812600135803, "step": 1721 }, { "completion_length": 142.21875, "epoch": 0.9213483146067416, "grad_norm": 0.7796744108200073, "kl": 0.15130531787872314, "learning_rate": 4.374845297023616e-06, "loss": 0.0061, "reward": 1.6813750267028809, "reward_std": 0.8533843159675598, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.40012499690055847, "step": 1722 }, { "completion_length": 140.40625, "epoch": 0.9218833600856072, "grad_norm": 1.0112961530685425, "kl": 0.1422506719827652, "learning_rate": 4.37381531193026e-06, "loss": 0.0057, "reward": 1.7922500371932983, "reward_std": 0.7482764720916748, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.35474997758865356, "step": 1723 }, { "completion_length": 148.5625, "epoch": 0.922418405564473, "grad_norm": 1.0834784507751465, "kl": 0.13349100947380066, "learning_rate": 4.372784600510183e-06, "loss": 0.0053, "reward": 1.5651562213897705, "reward_std": 0.9371335506439209, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4089062511920929, "step": 1724 }, { "completion_length": 145.75, "epoch": 0.9229534510433387, "grad_norm": 1.4493424892425537, "kl": 0.14498767256736755, "learning_rate": 4.371753163162907e-06, "loss": 0.0058, "reward": 1.8760937452316284, "reward_std": 1.156821846961975, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3917187452316284, "step": 1725 }, { "completion_length": 123.03125, "epoch": 0.9234884965222044, "grad_norm": 0.632162868976593, "kl": 0.1845470666885376, "learning_rate": 4.370721000288238e-06, "loss": 0.0074, "reward": 1.7274062633514404, "reward_std": 0.6796342134475708, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.38365626335144043, "step": 1726 }, { "completion_length": 152.375, "epoch": 0.9240235420010701, "grad_norm": 0.751734733581543, "kl": 0.13379576802253723, "learning_rate": 4.369688112286261e-06, "loss": 0.0054, "reward": 1.5214687585830688, "reward_std": 1.1192258596420288, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, "rewards/xmlcount_reward_func": 0.33396875858306885, "step": 1727 }, { "completion_length": 124.78125, "epoch": 0.9245585874799358, "grad_norm": 1.9384034872055054, "kl": 0.31503555178642273, "learning_rate": 4.368654499557343e-06, "loss": 0.0126, "reward": 1.8158438205718994, "reward_std": 0.8572397828102112, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44084376096725464, "step": 1728 }, { "completion_length": 132.96875, "epoch": 0.9250936329588015, "grad_norm": 0.8594684600830078, "kl": 0.13707134127616882, "learning_rate": 4.367620162502133e-06, "loss": 0.0055, "reward": 1.6257187128067017, "reward_std": 0.8189452886581421, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43821874260902405, "step": 1729 }, { "completion_length": 131.40625, "epoch": 0.9256286784376672, "grad_norm": 2.0194344520568848, "kl": 0.3549225628376007, "learning_rate": 4.36658510152156e-06, "loss": 0.0142, "reward": 2.28334379196167, "reward_std": 1.0838251113891602, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.42396873235702515, "step": 1730 }, { "completion_length": 155.1875, "epoch": 0.9261637239165329, "grad_norm": 4.871286869049072, "kl": 0.13578858971595764, "learning_rate": 4.365549317016832e-06, "loss": 0.0054, "reward": 1.002906322479248, "reward_std": 0.5792841911315918, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.39353126287460327, "step": 1731 }, { "completion_length": 135.1875, "epoch": 0.9266987693953986, "grad_norm": 0.7377698421478271, "kl": 0.22099195420742035, "learning_rate": 4.364512809389441e-06, "loss": 0.0088, "reward": 2.4504687786102295, "reward_std": 0.7456958293914795, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4660937488079071, "step": 1732 }, { "completion_length": 111.6875, "epoch": 0.9272338148742643, "grad_norm": 0.977899968624115, "kl": 0.2204095870256424, "learning_rate": 4.363475579041155e-06, "loss": 0.0088, "reward": 2.1582813262939453, "reward_std": 0.9921512603759766, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45515623688697815, "step": 1733 }, { "completion_length": 140.96875, "epoch": 0.92776886035313, "grad_norm": 1.3156582117080688, "kl": 0.20557233691215515, "learning_rate": 4.362437626374027e-06, "loss": 0.0082, "reward": 1.5597188472747803, "reward_std": 0.8736793398857117, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.3722187876701355, "step": 1734 }, { "completion_length": 159.75, "epoch": 0.9283039058319957, "grad_norm": 5.5502729415893555, "kl": 0.13555634021759033, "learning_rate": 4.361398951790385e-06, "loss": 0.0054, "reward": 1.2507812976837158, "reward_std": 0.6764481663703918, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, "rewards/xmlcount_reward_func": 0.36015623807907104, "step": 1735 }, { "completion_length": 119.0, "epoch": 0.9288389513108615, "grad_norm": 0.6376078724861145, "kl": 0.19762933254241943, "learning_rate": 4.3603595556928405e-06, "loss": 0.0079, "reward": 2.110687494277954, "reward_std": 0.6703567504882812, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4700624942779541, "step": 1736 }, { "completion_length": 119.15625, "epoch": 0.9293739967897271, "grad_norm": 0.8869109749794006, "kl": 0.23502352833747864, "learning_rate": 4.359319438484282e-06, "loss": 0.0094, "reward": 1.273937463760376, "reward_std": 0.48789432644844055, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.44581249356269836, "step": 1737 }, { "completion_length": 113.78125, "epoch": 0.9299090422685928, "grad_norm": 22.144203186035156, "kl": 0.6286795735359192, "learning_rate": 4.35827860056788e-06, "loss": 0.0251, "reward": 1.5885624885559082, "reward_std": 0.7979773879051208, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4479374885559082, "step": 1738 }, { "completion_length": 145.0625, "epoch": 0.9304440877474586, "grad_norm": 0.8603532314300537, "kl": 0.1690385937690735, "learning_rate": 4.3572370423470835e-06, "loss": 0.0068, "reward": 1.9894688129425049, "reward_std": 0.7591201066970825, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4425937533378601, "step": 1739 }, { "completion_length": 124.25, "epoch": 0.9309791332263242, "grad_norm": 1.8334039449691772, "kl": 0.18950216472148895, "learning_rate": 4.356194764225618e-06, "loss": 0.0076, "reward": 1.7772188186645508, "reward_std": 0.5935527086257935, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.464718759059906, "step": 1740 }, { "completion_length": 112.5, "epoch": 0.9315141787051899, "grad_norm": 1.2430635690689087, "kl": 0.33529743552207947, "learning_rate": 4.3551517666074945e-06, "loss": 0.0134, "reward": 1.6671875715255737, "reward_std": 0.551447331905365, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.47968748211860657, "step": 1741 }, { "completion_length": 119.46875, "epoch": 0.9320492241840557, "grad_norm": 0.6263246536254883, "kl": 0.20010694861412048, "learning_rate": 4.354108049896996e-06, "loss": 0.008, "reward": 2.0921876430511475, "reward_std": 0.6949583888053894, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4515624940395355, "step": 1742 }, { "completion_length": 113.96875, "epoch": 0.9325842696629213, "grad_norm": 0.9805403351783752, "kl": 0.2587941884994507, "learning_rate": 4.353063614498688e-06, "loss": 0.0104, "reward": 1.4641249179840088, "reward_std": 0.5851906538009644, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46412503719329834, "step": 1743 }, { "completion_length": 129.875, "epoch": 0.933119315141787, "grad_norm": 1.149207353591919, "kl": 0.22585462033748627, "learning_rate": 4.3520184608174135e-06, "loss": 0.009, "reward": 1.46875, "reward_std": 0.5279657244682312, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.453125, "step": 1744 }, { "completion_length": 151.625, "epoch": 0.9336543606206528, "grad_norm": 1.3710439205169678, "kl": 0.1581931710243225, "learning_rate": 4.350972589258293e-06, "loss": 0.0063, "reward": 1.7254688739776611, "reward_std": 1.1040607690811157, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4442187547683716, "step": 1745 }, { "completion_length": 125.75, "epoch": 0.9341894060995185, "grad_norm": 0.5696851015090942, "kl": 0.2013353854417801, "learning_rate": 4.3499260002267295e-06, "loss": 0.0081, "reward": 2.11328125, "reward_std": 0.6587404608726501, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.48828125, "step": 1746 }, { "completion_length": 125.53125, "epoch": 0.9347244515783841, "grad_norm": 0.7176088094711304, "kl": 0.16816475987434387, "learning_rate": 4.3488786941283975e-06, "loss": 0.0067, "reward": 1.761218786239624, "reward_std": 0.8903815746307373, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46434375643730164, "step": 1747 }, { "completion_length": 148.90625, "epoch": 0.9352594970572499, "grad_norm": 1.238542914390564, "kl": 0.14174935221672058, "learning_rate": 4.347830671369254e-06, "loss": 0.0057, "reward": 2.1295313835144043, "reward_std": 1.2479631900787354, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45765626430511475, "step": 1748 }, { "completion_length": 128.84375, "epoch": 0.9357945425361156, "grad_norm": 0.7421091794967651, "kl": 0.16724321246147156, "learning_rate": 4.346781932355534e-06, "loss": 0.0067, "reward": 2.073031187057495, "reward_std": 0.721503734588623, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4792812466621399, "step": 1749 }, { "completion_length": 130.1875, "epoch": 0.9363295880149812, "grad_norm": 0.766342043876648, "kl": 0.18273955583572388, "learning_rate": 4.345732477493747e-06, "loss": 0.0073, "reward": 2.0153751373291016, "reward_std": 0.7956967949867249, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.499750018119812, "step": 1750 }, { "completion_length": 118.78125, "epoch": 0.936864633493847, "grad_norm": 1.443809986114502, "kl": 0.2390037178993225, "learning_rate": 4.344682307190683e-06, "loss": 0.0096, "reward": 2.4653749465942383, "reward_std": 0.9244235157966614, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48100000619888306, "step": 1751 }, { "completion_length": 118.6875, "epoch": 0.9373996789727127, "grad_norm": 1.1564738750457764, "kl": 0.16062210500240326, "learning_rate": 4.343631421853408e-06, "loss": 0.0064, "reward": 2.233562469482422, "reward_std": 0.7354686856269836, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45231249928474426, "step": 1752 }, { "completion_length": 133.59375, "epoch": 0.9379347244515784, "grad_norm": 37627332.0, "kl": 165031.890625, "learning_rate": 4.3425798218892644e-06, "loss": 6601.2769, "reward": 2.289875030517578, "reward_std": 0.8163001537322998, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.44612500071525574, "step": 1753 }, { "completion_length": 116.21875, "epoch": 0.9384697699304441, "grad_norm": 19.21531867980957, "kl": 0.5083158016204834, "learning_rate": 4.341527507705874e-06, "loss": 0.0203, "reward": 2.0322186946868896, "reward_std": 0.6668305397033691, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4853437542915344, "step": 1754 }, { "completion_length": 109.28125, "epoch": 0.9390048154093098, "grad_norm": 6.159915447235107, "kl": 0.20481504499912262, "learning_rate": 4.340474479711133e-06, "loss": 0.0082, "reward": 2.520625114440918, "reward_std": 1.0713714361190796, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4893749952316284, "step": 1755 }, { "completion_length": 163.5, "epoch": 0.9395398608881755, "grad_norm": 1.1183980703353882, "kl": 0.16160184144973755, "learning_rate": 4.339420738313215e-06, "loss": 0.0065, "reward": 1.1034687757492065, "reward_std": 0.8387191891670227, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.33784377574920654, "step": 1756 }, { "completion_length": 144.78125, "epoch": 0.9400749063670412, "grad_norm": 0.5125148296356201, "kl": 0.12317290902137756, "learning_rate": 4.338366283920571e-06, "loss": 0.0049, "reward": 1.9028749465942383, "reward_std": 0.712437629699707, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44975000619888306, "step": 1757 }, { "completion_length": 141.25, "epoch": 0.9406099518459069, "grad_norm": 0.7577317357063293, "kl": 0.17760246992111206, "learning_rate": 4.337311116941928e-06, "loss": 0.0071, "reward": 2.352687358856201, "reward_std": 0.644402027130127, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4620624780654907, "step": 1758 }, { "completion_length": 132.375, "epoch": 0.9411449973247726, "grad_norm": 0.8034245371818542, "kl": 0.15197345614433289, "learning_rate": 4.336255237786288e-06, "loss": 0.0061, "reward": 1.9543750286102295, "reward_std": 1.0010006427764893, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4543749988079071, "step": 1759 }, { "completion_length": 146.53125, "epoch": 0.9416800428036383, "grad_norm": 0.5474246740341187, "kl": 0.13307088613510132, "learning_rate": 4.335198646862929e-06, "loss": 0.0053, "reward": 1.8352187871932983, "reward_std": 0.9785250425338745, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44459375739097595, "step": 1760 }, { "completion_length": 113.15625, "epoch": 0.942215088282504, "grad_norm": 0.7911604642868042, "kl": 0.21801380813121796, "learning_rate": 4.334141344581408e-06, "loss": 0.0087, "reward": 2.017218828201294, "reward_std": 0.8499639630317688, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48596876859664917, "step": 1761 }, { "completion_length": 141.25, "epoch": 0.9427501337613697, "grad_norm": 7283911.5, "kl": 133955.46875, "learning_rate": 4.333083331351557e-06, "loss": 5358.2197, "reward": 1.8236875534057617, "reward_std": 0.9322299957275391, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43306249380111694, "step": 1762 }, { "completion_length": 154.46875, "epoch": 0.9432851792402355, "grad_norm": 1.0404773950576782, "kl": 0.1676722764968872, "learning_rate": 4.332024607583478e-06, "loss": 0.0067, "reward": 1.7820312976837158, "reward_std": 0.8088974356651306, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.37578126788139343, "step": 1763 }, { "completion_length": 114.75, "epoch": 0.9438202247191011, "grad_norm": 0.8915718197822571, "kl": 0.16329172253608704, "learning_rate": 4.330965173687555e-06, "loss": 0.0065, "reward": 2.397437572479248, "reward_std": 1.1123089790344238, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49118751287460327, "step": 1764 }, { "completion_length": 161.78125, "epoch": 0.9443552701979668, "grad_norm": 3.240280866622925, "kl": 0.21950308978557587, "learning_rate": 4.3299050300744445e-06, "loss": 0.0088, "reward": 1.5731250047683716, "reward_std": 0.9720543026924133, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4012500047683716, "step": 1765 }, { "completion_length": 139.9375, "epoch": 0.9448903156768326, "grad_norm": 185251.53125, "kl": 1212.0809326171875, "learning_rate": 4.328844177155079e-06, "loss": 48.4832, "reward": 1.6363437175750732, "reward_std": 0.8524240851402283, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.433218777179718, "step": 1766 }, { "completion_length": 137.1875, "epoch": 0.9454253611556982, "grad_norm": 14.503552436828613, "kl": 0.4953671097755432, "learning_rate": 4.327782615340664e-06, "loss": 0.0198, "reward": 2.377406120300293, "reward_std": 0.8381434082984924, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4399062395095825, "step": 1767 }, { "completion_length": 142.625, "epoch": 0.9459604066345639, "grad_norm": 2.482555866241455, "kl": 0.25108808279037476, "learning_rate": 4.326720345042684e-06, "loss": 0.01, "reward": 1.6001250743865967, "reward_std": 0.7400832772254944, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3970000147819519, "step": 1768 }, { "completion_length": 151.625, "epoch": 0.9464954521134297, "grad_norm": 0.7739148736000061, "kl": 0.14502575993537903, "learning_rate": 4.325657366672892e-06, "loss": 0.0058, "reward": 1.5076249837875366, "reward_std": 0.9947701692581177, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.382625013589859, "step": 1769 }, { "completion_length": 136.5, "epoch": 0.9470304975922953, "grad_norm": 12774.685546875, "kl": 45.48685073852539, "learning_rate": 4.324593680643321e-06, "loss": 1.8195, "reward": 1.8622500896453857, "reward_std": 0.5241824984550476, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3622500002384186, "step": 1770 }, { "completion_length": 149.375, "epoch": 0.947565543071161, "grad_norm": 2.1660959720611572, "kl": 0.16462969779968262, "learning_rate": 4.323529287366275e-06, "loss": 0.0066, "reward": 1.6047812700271606, "reward_std": 0.7920072674751282, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46415627002716064, "step": 1771 }, { "completion_length": 147.53125, "epoch": 0.9481005885500268, "grad_norm": 1.4998259544372559, "kl": 0.15024057030677795, "learning_rate": 4.322464187254335e-06, "loss": 0.006, "reward": 1.9005000591278076, "reward_std": 1.0103940963745117, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44737499952316284, "step": 1772 }, { "completion_length": 135.5625, "epoch": 0.9486356340288925, "grad_norm": 0.8505929112434387, "kl": 0.20185256004333496, "learning_rate": 4.321398380720351e-06, "loss": 0.0081, "reward": 2.029031276702881, "reward_std": 0.9442219734191895, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46653124690055847, "step": 1773 }, { "completion_length": 125.28125, "epoch": 0.9491706795077581, "grad_norm": 1.6782907247543335, "kl": 0.22689415514469147, "learning_rate": 4.3203318681774525e-06, "loss": 0.0091, "reward": 1.923781156539917, "reward_std": 0.7496955990791321, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47065627574920654, "step": 1774 }, { "completion_length": 137.65625, "epoch": 0.9497057249866239, "grad_norm": 2.2550106048583984, "kl": 0.16053591668605804, "learning_rate": 4.31926465003904e-06, "loss": 0.0064, "reward": 1.8494999408721924, "reward_std": 1.0324666500091553, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.44325000047683716, "step": 1775 }, { "completion_length": 156.53125, "epoch": 0.9502407704654896, "grad_norm": 0.812242329120636, "kl": 0.16035744547843933, "learning_rate": 4.318196726718787e-06, "loss": 0.0064, "reward": 1.1944999694824219, "reward_std": 0.9354825615882874, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.39762499928474426, "step": 1776 }, { "completion_length": 160.0, "epoch": 0.9507758159443552, "grad_norm": 0.605988085269928, "kl": 0.11516206711530685, "learning_rate": 4.31712809863064e-06, "loss": 0.0046, "reward": 1.7821874618530273, "reward_std": 0.9949606657028198, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4071875214576721, "step": 1777 }, { "completion_length": 131.8125, "epoch": 0.951310861423221, "grad_norm": 1.459506869316101, "kl": 0.29212474822998047, "learning_rate": 4.316058766188823e-06, "loss": 0.0117, "reward": 1.7807812690734863, "reward_std": 0.6807236671447754, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.39015623927116394, "step": 1778 }, { "completion_length": 118.75, "epoch": 0.9518459069020867, "grad_norm": 85184946176.0, "kl": 995810048.0, "learning_rate": 4.3149887298078275e-06, "loss": 39832400.0, "reward": 1.9167811870574951, "reward_std": 0.7017369866371155, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.4324062466621399, "step": 1779 }, { "completion_length": 164.0625, "epoch": 0.9523809523809523, "grad_norm": 0.3887767791748047, "kl": 0.10854979604482651, "learning_rate": 4.31391798990242e-06, "loss": 0.0043, "reward": 1.6148749589920044, "reward_std": 0.8427173495292664, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3961249887943268, "step": 1780 }, { "completion_length": 110.75, "epoch": 0.9529159978598181, "grad_norm": 0.9981739521026611, "kl": 0.18920917809009552, "learning_rate": 4.31284654688764e-06, "loss": 0.0076, "reward": 2.200343608856201, "reward_std": 0.9812730550765991, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4815937578678131, "step": 1781 }, { "completion_length": 118.5, "epoch": 0.9534510433386838, "grad_norm": 7486.81640625, "kl": 311.1668701171875, "learning_rate": 4.3117744011788e-06, "loss": 12.4467, "reward": 2.2003438472747803, "reward_std": 0.6298859119415283, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4659687578678131, "step": 1782 }, { "completion_length": 150.21875, "epoch": 0.9539860888175495, "grad_norm": 1.5031914710998535, "kl": 0.17062385380268097, "learning_rate": 4.310701553191485e-06, "loss": 0.0068, "reward": 1.3524374961853027, "reward_std": 0.8612300157546997, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.33681249618530273, "step": 1783 }, { "completion_length": 125.84375, "epoch": 0.9545211342964152, "grad_norm": 1.0126053094863892, "kl": 0.20391827821731567, "learning_rate": 4.309628003341547e-06, "loss": 0.0082, "reward": 2.5894999504089355, "reward_std": 0.7906216382980347, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4801250100135803, "step": 1784 }, { "completion_length": 126.84375, "epoch": 0.9550561797752809, "grad_norm": 0.870381236076355, "kl": 0.1541869193315506, "learning_rate": 4.308553752045119e-06, "loss": 0.0062, "reward": 1.6640625, "reward_std": 0.5066734552383423, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 1785 }, { "completion_length": 113.53125, "epoch": 0.9555912252541466, "grad_norm": 0.8297276496887207, "kl": 0.22935929894447327, "learning_rate": 4.307478799718602e-06, "loss": 0.0092, "reward": 1.828781247138977, "reward_std": 0.5238828659057617, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48503124713897705, "step": 1786 }, { "completion_length": 149.09375, "epoch": 0.9561262707330123, "grad_norm": 1.0869271755218506, "kl": 0.18790745735168457, "learning_rate": 4.306403146778664e-06, "loss": 0.0075, "reward": 2.0199999809265137, "reward_std": 0.6267905235290527, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45750001072883606, "step": 1787 }, { "completion_length": 161.84375, "epoch": 0.956661316211878, "grad_norm": 1.2862271070480347, "kl": 0.1663682758808136, "learning_rate": 4.305326793642252e-06, "loss": 0.0067, "reward": 1.4589061737060547, "reward_std": 0.9458886384963989, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41203123331069946, "step": 1788 }, { "completion_length": 112.0, "epoch": 0.9571963616907437, "grad_norm": 0.6455591320991516, "kl": 0.1824299395084381, "learning_rate": 4.30424974072658e-06, "loss": 0.0073, "reward": 1.9217500686645508, "reward_std": 0.6344706416130066, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 1789 }, { "completion_length": 135.40625, "epoch": 0.9577314071696095, "grad_norm": 3.0921308994293213, "kl": 0.22413226962089539, "learning_rate": 4.303171988449134e-06, "loss": 0.009, "reward": 2.3061561584472656, "reward_std": 0.9670370817184448, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4311562478542328, "step": 1790 }, { "completion_length": 127.40625, "epoch": 0.9582664526484751, "grad_norm": 1.3856581449508667, "kl": 0.20649202167987823, "learning_rate": 4.302093537227672e-06, "loss": 0.0083, "reward": 2.1166250705718994, "reward_std": 0.6033762693405151, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.49162501096725464, "step": 1791 }, { "completion_length": 143.0, "epoch": 0.9588014981273408, "grad_norm": 1.3217743635177612, "kl": 0.19494423270225525, "learning_rate": 4.301014387480221e-06, "loss": 0.0078, "reward": 1.1930313110351562, "reward_std": 0.42098626494407654, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4586562514305115, "step": 1792 }, { "completion_length": 138.84375, "epoch": 0.9593365436062066, "grad_norm": 0.4897870719432831, "kl": 0.15856224298477173, "learning_rate": 4.2999345396250825e-06, "loss": 0.0063, "reward": 2.2084999084472656, "reward_std": 0.959031879901886, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4585000276565552, "step": 1793 }, { "completion_length": 101.4375, "epoch": 0.9598715890850722, "grad_norm": 47.2495002746582, "kl": 0.564161479473114, "learning_rate": 4.298853994080825e-06, "loss": 0.0226, "reward": 2.906125068664551, "reward_std": 0.22053346037864685, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 1794 }, { "completion_length": 117.53125, "epoch": 0.9604066345639379, "grad_norm": 0.8218206167221069, "kl": 0.17118775844573975, "learning_rate": 4.297772751266288e-06, "loss": 0.0068, "reward": 2.7849373817443848, "reward_std": 0.4198360741138458, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4724375009536743, "step": 1795 }, { "completion_length": 128.59375, "epoch": 0.9609416800428037, "grad_norm": 1.206085205078125, "kl": 0.24296100437641144, "learning_rate": 4.296690811600583e-06, "loss": 0.0097, "reward": 1.8691250085830688, "reward_std": 0.9443482756614685, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44725000858306885, "step": 1796 }, { "completion_length": 128.21875, "epoch": 0.9614767255216693, "grad_norm": 1.684972882270813, "kl": 0.21951979398727417, "learning_rate": 4.295608175503091e-06, "loss": 0.0088, "reward": 2.593562602996826, "reward_std": 1.0805655717849731, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4841874837875366, "step": 1797 }, { "completion_length": 116.40625, "epoch": 0.962011771000535, "grad_norm": 0.7345799207687378, "kl": 0.20257118344306946, "learning_rate": 4.294524843393461e-06, "loss": 0.0081, "reward": 2.2599375247955322, "reward_std": 0.5263763070106506, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49431249499320984, "step": 1798 }, { "completion_length": 140.4375, "epoch": 0.9625468164794008, "grad_norm": 2.2249162197113037, "kl": 0.19241321086883545, "learning_rate": 4.293440815691613e-06, "loss": 0.0077, "reward": 1.8246562480926514, "reward_std": 0.48759910464286804, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43403124809265137, "step": 1799 }, { "completion_length": 144.34375, "epoch": 0.9630818619582665, "grad_norm": 2.160519599914551, "kl": 0.3880872428417206, "learning_rate": 4.2923560928177385e-06, "loss": 0.0155, "reward": 2.2391562461853027, "reward_std": 0.8246207237243652, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44228124618530273, "step": 1800 }, { "completion_length": 139.78125, "epoch": 0.9636169074371321, "grad_norm": 0.6131810545921326, "kl": 0.15727895498275757, "learning_rate": 4.291270675192296e-06, "loss": 0.0063, "reward": 1.2489687204360962, "reward_std": 0.5329717397689819, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4364687502384186, "step": 1801 }, { "completion_length": 144.28125, "epoch": 0.9641519529159979, "grad_norm": 2.105043649673462, "kl": 0.1962747871875763, "learning_rate": 4.290184563236015e-06, "loss": 0.0079, "reward": 1.5593750476837158, "reward_std": 0.9035195112228394, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41874998807907104, "step": 1802 }, { "completion_length": 160.25, "epoch": 0.9646869983948636, "grad_norm": 0.9007627964019775, "kl": 0.11176802217960358, "learning_rate": 4.289097757369892e-06, "loss": 0.0045, "reward": 1.701968789100647, "reward_std": 1.047478437423706, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4519687592983246, "step": 1803 }, { "completion_length": 133.15625, "epoch": 0.9652220438737292, "grad_norm": 0.6008546948432922, "kl": 0.16530826687812805, "learning_rate": 4.288010258015194e-06, "loss": 0.0066, "reward": 1.4222187995910645, "reward_std": 0.5450422763824463, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4534687399864197, "step": 1804 }, { "completion_length": 154.1875, "epoch": 0.965757089352595, "grad_norm": 0.6569623947143555, "kl": 0.2004370093345642, "learning_rate": 4.286922065593459e-06, "loss": 0.008, "reward": 1.8460311889648438, "reward_std": 0.7983023524284363, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4241562783718109, "step": 1805 }, { "completion_length": 114.53125, "epoch": 0.9662921348314607, "grad_norm": 15.077983856201172, "kl": 0.3197469711303711, "learning_rate": 4.285833180526487e-06, "loss": 0.0128, "reward": 1.6402499675750732, "reward_std": 0.5111061334609985, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49962499737739563, "step": 1806 }, { "completion_length": 123.5, "epoch": 0.9668271803103263, "grad_norm": 1.4863815307617188, "kl": 0.1981935054063797, "learning_rate": 4.284743603236354e-06, "loss": 0.0079, "reward": 1.853156328201294, "reward_std": 0.6635421514511108, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47815626859664917, "step": 1807 }, { "completion_length": 130.875, "epoch": 0.9673622257891921, "grad_norm": 0.5969281196594238, "kl": 0.18443229794502258, "learning_rate": 4.2836533341454005e-06, "loss": 0.0074, "reward": 2.2491250038146973, "reward_std": 0.6380741596221924, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.45225000381469727, "step": 1808 }, { "completion_length": 111.6875, "epoch": 0.9678972712680578, "grad_norm": 1.1602946519851685, "kl": 0.18341398239135742, "learning_rate": 4.282562373676235e-06, "loss": 0.0073, "reward": 2.4259376525878906, "reward_std": 1.0443980693817139, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4884374737739563, "step": 1809 }, { "completion_length": 144.625, "epoch": 0.9684323167469235, "grad_norm": 0.8623385429382324, "kl": 0.23028716444969177, "learning_rate": 4.281470722251734e-06, "loss": 0.0092, "reward": 1.6421250104904175, "reward_std": 0.8725264072418213, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4389999806880951, "step": 1810 }, { "completion_length": 134.9375, "epoch": 0.9689673622257892, "grad_norm": 0.7015960216522217, "kl": 0.15229842066764832, "learning_rate": 4.280378380295044e-06, "loss": 0.0061, "reward": 1.961531162261963, "reward_std": 0.7455821633338928, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.39903125166893005, "step": 1811 }, { "completion_length": 135.09375, "epoch": 0.9695024077046549, "grad_norm": 20.309368133544922, "kl": 2.8031671047210693, "learning_rate": 4.279285348229577e-06, "loss": 0.1121, "reward": 1.606874942779541, "reward_std": 0.623102068901062, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4662500023841858, "step": 1812 }, { "completion_length": 128.71875, "epoch": 0.9700374531835206, "grad_norm": 0.6087217926979065, "kl": 0.1740274727344513, "learning_rate": 4.278191626479014e-06, "loss": 0.007, "reward": 2.904531240463257, "reward_std": 0.7818965911865234, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45140624046325684, "step": 1813 }, { "completion_length": 133.6875, "epoch": 0.9705724986623863, "grad_norm": 1.409761667251587, "kl": 0.14838001132011414, "learning_rate": 4.277097215467301e-06, "loss": 0.0059, "reward": 2.0452187061309814, "reward_std": 0.7194041013717651, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4670937657356262, "step": 1814 }, { "completion_length": 119.40625, "epoch": 0.971107544141252, "grad_norm": 3.7082948684692383, "kl": 0.5564590692520142, "learning_rate": 4.276002115618654e-06, "loss": 0.0223, "reward": 2.1931562423706055, "reward_std": 0.7606329321861267, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45878124237060547, "step": 1815 }, { "completion_length": 134.03125, "epoch": 0.9716425896201177, "grad_norm": 0.9180464744567871, "kl": 0.14860448241233826, "learning_rate": 4.274906327357554e-06, "loss": 0.0059, "reward": 1.888312578201294, "reward_std": 0.8672767281532288, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.4351874887943268, "step": 1816 }, { "completion_length": 145.03125, "epoch": 0.9721776350989834, "grad_norm": 0.8414563536643982, "kl": 0.14915987849235535, "learning_rate": 4.273809851108749e-06, "loss": 0.006, "reward": 1.9182188510894775, "reward_std": 0.9825780391693115, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4338437616825104, "step": 1817 }, { "completion_length": 124.09375, "epoch": 0.9727126805778491, "grad_norm": 467327904.0, "kl": 33558556.0, "learning_rate": 4.272712687297256e-06, "loss": 1342342.125, "reward": 2.445187568664551, "reward_std": 1.068888545036316, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.492062509059906, "step": 1818 }, { "completion_length": 116.8125, "epoch": 0.9732477260567148, "grad_norm": 1.3487114906311035, "kl": 0.22719278931617737, "learning_rate": 4.271614836348354e-06, "loss": 0.0091, "reward": 1.824906349182129, "reward_std": 0.8064679503440857, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41865625977516174, "step": 1819 }, { "completion_length": 116.125, "epoch": 0.9737827715355806, "grad_norm": 0.8971264362335205, "kl": 0.18527904152870178, "learning_rate": 4.270516298687594e-06, "loss": 0.0074, "reward": 2.8803436756134033, "reward_std": 0.7151044607162476, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4584687352180481, "step": 1820 }, { "completion_length": 161.8125, "epoch": 0.9743178170144462, "grad_norm": 0.6226778030395508, "kl": 0.15061479806900024, "learning_rate": 4.269417074740789e-06, "loss": 0.006, "reward": 1.316562533378601, "reward_std": 0.9870871305465698, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3946875035762787, "step": 1821 }, { "completion_length": 139.5625, "epoch": 0.9748528624933119, "grad_norm": 0.6007213592529297, "kl": 0.17488375306129456, "learning_rate": 4.2683171649340184e-06, "loss": 0.007, "reward": 1.8961563110351562, "reward_std": 0.7127403616905212, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4742812514305115, "step": 1822 }, { "completion_length": 125.90625, "epoch": 0.9753879079721777, "grad_norm": 233901.703125, "kl": 12429.7734375, "learning_rate": 4.26721656969363e-06, "loss": 497.191, "reward": 1.7120624780654907, "reward_std": 0.479485422372818, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4776875078678131, "step": 1823 }, { "completion_length": 131.59375, "epoch": 0.9759229534510433, "grad_norm": 2.786264657974243, "kl": 0.18614278733730316, "learning_rate": 4.2661152894462335e-06, "loss": 0.0074, "reward": 1.1781562566757202, "reward_std": 0.57305908203125, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.4125312566757202, "step": 1824 }, { "completion_length": 146.78125, "epoch": 0.976457998929909, "grad_norm": 2.0599679946899414, "kl": 0.14059102535247803, "learning_rate": 4.265013324618708e-06, "loss": 0.0056, "reward": 1.5618125200271606, "reward_std": 0.8634971976280212, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.38993752002716064, "step": 1825 }, { "completion_length": 111.65625, "epoch": 0.9769930444087748, "grad_norm": 0.7174727916717529, "kl": 0.17344264686107635, "learning_rate": 4.263910675638195e-06, "loss": 0.0069, "reward": 2.9448750019073486, "reward_std": 0.8733412623405457, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49175000190734863, "step": 1826 }, { "completion_length": 129.625, "epoch": 0.9775280898876404, "grad_norm": 1.8836461305618286, "kl": 0.23122835159301758, "learning_rate": 4.2628073429321045e-06, "loss": 0.0092, "reward": 1.8616249561309814, "reward_std": 0.590540885925293, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4553750157356262, "step": 1827 }, { "completion_length": 139.6875, "epoch": 0.9780631353665061, "grad_norm": 0.6075541377067566, "kl": 0.14617140591144562, "learning_rate": 4.2617033269281065e-06, "loss": 0.0058, "reward": 2.085812568664551, "reward_std": 1.0250346660614014, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 1828 }, { "completion_length": 117.3125, "epoch": 0.9785981808453719, "grad_norm": 1.3211040496826172, "kl": 0.23922160267829895, "learning_rate": 4.2605986280541415e-06, "loss": 0.0096, "reward": 2.0875000953674316, "reward_std": 0.6871401071548462, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4937500059604645, "step": 1829 }, { "completion_length": 121.34375, "epoch": 0.9791332263242376, "grad_norm": 3.795337200164795, "kl": 0.3509388864040375, "learning_rate": 4.259493246738409e-06, "loss": 0.014, "reward": 2.24428129196167, "reward_std": 0.639340341091156, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44740623235702515, "step": 1830 }, { "completion_length": 161.34375, "epoch": 0.9796682718031032, "grad_norm": 0.6991567611694336, "kl": 0.12361042946577072, "learning_rate": 4.258387183409379e-06, "loss": 0.0049, "reward": 1.4495313167572021, "reward_std": 0.5461598038673401, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4495312571525574, "step": 1831 }, { "completion_length": 145.125, "epoch": 0.980203317281969, "grad_norm": 4.440304756164551, "kl": 0.34301212430000305, "learning_rate": 4.25728043849578e-06, "loss": 0.0137, "reward": 1.82421875, "reward_std": 0.27680087089538574, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.48046875, "step": 1832 }, { "completion_length": 126.375, "epoch": 0.9807383627608347, "grad_norm": 120.8355484008789, "kl": 0.44087696075439453, "learning_rate": 4.25617301242661e-06, "loss": 0.0176, "reward": 2.7008748054504395, "reward_std": 0.7899216413497925, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46650001406669617, "step": 1833 }, { "completion_length": 117.9375, "epoch": 0.9812734082397003, "grad_norm": 0.9184818863868713, "kl": 0.16214808821678162, "learning_rate": 4.255064905631127e-06, "loss": 0.0065, "reward": 2.491000175476074, "reward_std": 0.6533023118972778, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4753749966621399, "step": 1834 }, { "completion_length": 107.46875, "epoch": 0.9818084537185661, "grad_norm": 0.7405413389205933, "kl": 0.20643401145935059, "learning_rate": 4.253956118538856e-06, "loss": 0.0083, "reward": 3.28125, "reward_std": 0.4505236744880676, "rewards/correctness_reward_func": 1.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 1835 }, { "completion_length": 136.09375, "epoch": 0.9823434991974318, "grad_norm": 1.3932803869247437, "kl": 0.1763356775045395, "learning_rate": 4.2528466515795815e-06, "loss": 0.0071, "reward": 0.9932812452316284, "reward_std": 0.6369936466217041, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3839062452316284, "step": 1836 }, { "completion_length": 136.78125, "epoch": 0.9828785446762975, "grad_norm": 0.7436097264289856, "kl": 0.1759924739599228, "learning_rate": 4.2517365051833564e-06, "loss": 0.007, "reward": 1.9042500257492065, "reward_std": 0.7593050003051758, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.40424999594688416, "step": 1837 }, { "completion_length": 156.375, "epoch": 0.9834135901551632, "grad_norm": 0.8075975775718689, "kl": 0.16961519420146942, "learning_rate": 4.250625679780494e-06, "loss": 0.0068, "reward": 1.7325313091278076, "reward_std": 0.829848051071167, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42003124952316284, "step": 1838 }, { "completion_length": 147.90625, "epoch": 0.9839486356340289, "grad_norm": 328.8703308105469, "kl": 1.812641978263855, "learning_rate": 4.249514175801572e-06, "loss": 0.0725, "reward": 2.073406219482422, "reward_std": 0.8313570022583008, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43278124928474426, "step": 1839 }, { "completion_length": 117.40625, "epoch": 0.9844836811128946, "grad_norm": 1.7281930446624756, "kl": 0.30636563897132874, "learning_rate": 4.2484019936774305e-06, "loss": 0.0123, "reward": 1.794562578201294, "reward_std": 0.7389140129089355, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4351874887943268, "step": 1840 }, { "completion_length": 119.84375, "epoch": 0.9850187265917603, "grad_norm": 0.7670500874519348, "kl": 0.18957111239433289, "learning_rate": 4.247289133839171e-06, "loss": 0.0076, "reward": 2.0528438091278076, "reward_std": 0.8660012483596802, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47471874952316284, "step": 1841 }, { "completion_length": 134.40625, "epoch": 0.985553772070626, "grad_norm": 0.39108625054359436, "kl": 0.1317313313484192, "learning_rate": 4.246175596718161e-06, "loss": 0.0053, "reward": 2.1796875, "reward_std": 0.5644975900650024, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4765625, "step": 1842 }, { "completion_length": 141.28125, "epoch": 0.9860888175494917, "grad_norm": 1.5172892808914185, "kl": 0.18897873163223267, "learning_rate": 4.245061382746029e-06, "loss": 0.0076, "reward": 1.4870312213897705, "reward_std": 0.8069103956222534, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3776562511920929, "step": 1843 }, { "completion_length": 117.75, "epoch": 0.9866238630283574, "grad_norm": 0.8979672193527222, "kl": 0.24812959134578705, "learning_rate": 4.243946492354664e-06, "loss": 0.0099, "reward": 2.483187437057495, "reward_std": 0.8093276023864746, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4988124966621399, "step": 1844 }, { "completion_length": 100.90625, "epoch": 0.9871589085072231, "grad_norm": 2.421661853790283, "kl": 0.37479302287101746, "learning_rate": 4.242830925976221e-06, "loss": 0.015, "reward": 2.5625, "reward_std": 0.9778778553009033, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 1845 }, { "completion_length": 129.125, "epoch": 0.9876939539860888, "grad_norm": 3.727755546569824, "kl": 0.27632737159729004, "learning_rate": 4.241714684043115e-06, "loss": 0.0111, "reward": 1.7864373922348022, "reward_std": 0.5426410436630249, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4426874816417694, "step": 1846 }, { "completion_length": 127.59375, "epoch": 0.9882289994649546, "grad_norm": 0.6490991711616516, "kl": 0.16460981965065002, "learning_rate": 4.240597766988019e-06, "loss": 0.0066, "reward": 2.0641562938690186, "reward_std": 0.6832209825515747, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4704062342643738, "step": 1847 }, { "completion_length": 123.6875, "epoch": 0.9887640449438202, "grad_norm": 2.284445285797119, "kl": 0.32611292600631714, "learning_rate": 4.239480175243876e-06, "loss": 0.013, "reward": 2.0464375019073486, "reward_std": 0.592338502407074, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49956250190734863, "step": 1848 }, { "completion_length": 126.0625, "epoch": 0.9892990904226859, "grad_norm": 0.6783219575881958, "kl": 0.17237231135368347, "learning_rate": 4.238361909243883e-06, "loss": 0.0069, "reward": 1.7959063053131104, "reward_std": 0.873412549495697, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4521562457084656, "step": 1849 }, { "completion_length": 108.21875, "epoch": 0.9898341359015517, "grad_norm": 0.7365827560424805, "kl": 0.20600822567939758, "learning_rate": 4.237242969421503e-06, "loss": 0.0082, "reward": 2.3125, "reward_std": 0.5737333297729492, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 1850 }, { "completion_length": 132.53125, "epoch": 0.9903691813804173, "grad_norm": 3.128035068511963, "kl": 0.5582296252250671, "learning_rate": 4.2361233562104585e-06, "loss": 0.0223, "reward": 1.8869374990463257, "reward_std": 0.8100188374519348, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4338124990463257, "step": 1851 }, { "completion_length": 120.625, "epoch": 0.990904226859283, "grad_norm": 0.452317476272583, "kl": 0.15635041892528534, "learning_rate": 4.235003070044731e-06, "loss": 0.0063, "reward": 2.204718828201294, "reward_std": 0.8743663430213928, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47034376859664917, "step": 1852 }, { "completion_length": 102.75, "epoch": 0.9914392723381488, "grad_norm": 0.6858958601951599, "kl": 0.20749887824058533, "learning_rate": 4.233882111358568e-06, "loss": 0.0083, "reward": 2.694531202316284, "reward_std": 0.8024338483810425, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49140626192092896, "step": 1853 }, { "completion_length": 111.375, "epoch": 0.9919743178170144, "grad_norm": 0.9728786945343018, "kl": 0.21879063546657562, "learning_rate": 4.232760480586472e-06, "loss": 0.0088, "reward": 2.464531183242798, "reward_std": 0.5015960335731506, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4957812428474426, "step": 1854 }, { "completion_length": 125.78125, "epoch": 0.9925093632958801, "grad_norm": 3.3465576171875, "kl": 0.2760304808616638, "learning_rate": 4.23163817816321e-06, "loss": 0.011, "reward": 2.129625082015991, "reward_std": 0.7916821837425232, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45774999260902405, "step": 1855 }, { "completion_length": 145.625, "epoch": 0.9930444087747459, "grad_norm": 0.9536641836166382, "kl": 0.23010335862636566, "learning_rate": 4.230515204523807e-06, "loss": 0.0092, "reward": 1.9470938444137573, "reward_std": 1.1245629787445068, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.44709375500679016, "step": 1856 }, { "completion_length": 103.75, "epoch": 0.9935794542536116, "grad_norm": 4.183147430419922, "kl": 0.2385844737291336, "learning_rate": 4.229391560103549e-06, "loss": 0.0095, "reward": 2.6605937480926514, "reward_std": 0.687746524810791, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48871874809265137, "step": 1857 }, { "completion_length": 139.78125, "epoch": 0.9941144997324772, "grad_norm": 1.731711506843567, "kl": 0.24763736128807068, "learning_rate": 4.228267245337983e-06, "loss": 0.0099, "reward": 1.907843828201294, "reward_std": 0.7780975103378296, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4547187387943268, "step": 1858 }, { "completion_length": 158.71875, "epoch": 0.994649545211343, "grad_norm": 0.6335492730140686, "kl": 0.1479901373386383, "learning_rate": 4.227142260662915e-06, "loss": 0.0059, "reward": 1.320812463760376, "reward_std": 0.7718144059181213, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.38331249356269836, "step": 1859 }, { "completion_length": 153.90625, "epoch": 0.9951845906902087, "grad_norm": 1.3960403203964233, "kl": 0.13733863830566406, "learning_rate": 4.226016606514411e-06, "loss": 0.0055, "reward": 1.406406283378601, "reward_std": 0.9162248373031616, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.3595312237739563, "step": 1860 }, { "completion_length": 136.5625, "epoch": 0.9957196361690743, "grad_norm": 1.0012180805206299, "kl": 0.1743142306804657, "learning_rate": 4.224890283328794e-06, "loss": 0.007, "reward": 1.9835937023162842, "reward_std": 0.6720241904258728, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48359376192092896, "step": 1861 }, { "completion_length": 136.59375, "epoch": 0.9962546816479401, "grad_norm": 1.3044617176055908, "kl": 0.17949865758419037, "learning_rate": 4.22376329154265e-06, "loss": 0.0072, "reward": 2.0924999713897705, "reward_std": 1.0426878929138184, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4675000011920929, "step": 1862 }, { "completion_length": 168.71875, "epoch": 0.9967897271268058, "grad_norm": 0.7573160529136658, "kl": 0.12154608964920044, "learning_rate": 4.222635631592823e-06, "loss": 0.0049, "reward": 1.1711561679840088, "reward_std": 0.7201180458068848, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.171875, "rewards/xmlcount_reward_func": 0.31178125739097595, "step": 1863 }, { "completion_length": 142.34375, "epoch": 0.9973247726056714, "grad_norm": 10.606563568115234, "kl": 1.1635801792144775, "learning_rate": 4.221507303916414e-06, "loss": 0.0465, "reward": 1.3781249523162842, "reward_std": 0.5860216617584229, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.40937501192092896, "step": 1864 }, { "completion_length": 132.375, "epoch": 0.9978598180845372, "grad_norm": 0.8223066329956055, "kl": 0.19768047332763672, "learning_rate": 4.220378308950787e-06, "loss": 0.0079, "reward": 2.224562644958496, "reward_std": 0.6913583278656006, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45893749594688416, "step": 1865 }, { "completion_length": 125.3125, "epoch": 0.9983948635634029, "grad_norm": 1.1116673946380615, "kl": 0.16213105618953705, "learning_rate": 4.219248647133559e-06, "loss": 0.0065, "reward": 2.2300000190734863, "reward_std": 0.8066980838775635, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44874998927116394, "step": 1866 }, { "completion_length": 115.78125, "epoch": 0.9989299090422686, "grad_norm": 0.44593751430511475, "kl": 0.14850673079490662, "learning_rate": 4.218118318902609e-06, "loss": 0.0059, "reward": 2.373406171798706, "reward_std": 0.4847274422645569, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48278123140335083, "step": 1867 }, { "completion_length": 134.125, "epoch": 0.9994649545211343, "grad_norm": 56.176692962646484, "kl": 0.46300262212753296, "learning_rate": 4.216987324696078e-06, "loss": 0.0185, "reward": 2.1644062995910645, "reward_std": 0.9145255088806152, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4456562399864197, "step": 1868 }, { "completion_length": 188.0, "epoch": 1.0, "grad_norm": 0.9185136556625366, "kl": 0.14151716232299805, "learning_rate": 4.215855664952356e-06, "loss": 0.0057, "reward": 1.0, "reward_std": 1.0144323110580444, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 1869 }, { "completion_length": 94.9375, "epoch": 1.0005350454788657, "grad_norm": 0.7978875041007996, "kl": 0.1862207055091858, "learning_rate": 4.214723340110098e-06, "loss": 0.0074, "reward": 2.737968683242798, "reward_std": 0.3664216697216034, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4723437428474426, "step": 1870 }, { "completion_length": 114.21875, "epoch": 1.0010700909577315, "grad_norm": 2.9410932064056396, "kl": 0.16872546076774597, "learning_rate": 4.213590350608215e-06, "loss": 0.0067, "reward": 2.365187644958496, "reward_std": 1.0578118562698364, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47456249594688416, "step": 1871 }, { "completion_length": 130.78125, "epoch": 1.0016051364365972, "grad_norm": 0.8221540451049805, "kl": 0.19142813980579376, "learning_rate": 4.212456696885876e-06, "loss": 0.0077, "reward": 2.1580312252044678, "reward_std": 0.9054326415061951, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43928125500679016, "step": 1872 }, { "completion_length": 128.5625, "epoch": 1.0021401819154627, "grad_norm": 0.8785712718963623, "kl": 0.15089961886405945, "learning_rate": 4.2113223793825055e-06, "loss": 0.006, "reward": 1.2952499389648438, "reward_std": 0.29430848360061646, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4671249985694885, "step": 1873 }, { "completion_length": 141.71875, "epoch": 1.0026752273943285, "grad_norm": 11311549440.0, "kl": 52030032.0, "learning_rate": 4.210187398537788e-06, "loss": 2081201.125, "reward": 2.019343852996826, "reward_std": 0.9329698085784912, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.409968763589859, "step": 1874 }, { "completion_length": 121.125, "epoch": 1.0032102728731942, "grad_norm": 5.576756954193115, "kl": 0.5326619744300842, "learning_rate": 4.209051754791662e-06, "loss": 0.0213, "reward": 2.1914374828338623, "reward_std": 0.6617318391799927, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4414375126361847, "step": 1875 }, { "completion_length": 149.46875, "epoch": 1.00374531835206, "grad_norm": 1.9817917346954346, "kl": 0.15275096893310547, "learning_rate": 4.2079154485843275e-06, "loss": 0.0061, "reward": 1.49818754196167, "reward_std": 0.7335731983184814, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.37318748235702515, "step": 1876 }, { "completion_length": 120.40625, "epoch": 1.0042803638309257, "grad_norm": 0.5637605786323547, "kl": 0.18145698308944702, "learning_rate": 4.206778480356236e-06, "loss": 0.0073, "reward": 2.355562448501587, "reward_std": 0.900492250919342, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4649375081062317, "step": 1877 }, { "completion_length": 123.03125, "epoch": 1.0048154093097914, "grad_norm": 0.7672953605651855, "kl": 0.20168757438659668, "learning_rate": 4.2056408505480995e-06, "loss": 0.0081, "reward": 2.369406223297119, "reward_std": 0.9146740436553955, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46315622329711914, "step": 1878 }, { "completion_length": 133.15625, "epoch": 1.0053504547886571, "grad_norm": 1.0667250156402588, "kl": 0.18955561518669128, "learning_rate": 4.204502559600884e-06, "loss": 0.0076, "reward": 1.4852187633514404, "reward_std": 0.7999729514122009, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46959376335144043, "step": 1879 }, { "completion_length": 122.46875, "epoch": 1.0058855002675227, "grad_norm": 0.7968709468841553, "kl": 0.16206121444702148, "learning_rate": 4.203363607955814e-06, "loss": 0.0065, "reward": 2.449312448501587, "reward_std": 0.7980909943580627, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4805625081062317, "step": 1880 }, { "completion_length": 122.46875, "epoch": 1.0064205457463884, "grad_norm": 0.8906788229942322, "kl": 0.19994693994522095, "learning_rate": 4.202223996054369e-06, "loss": 0.008, "reward": 1.8028124570846558, "reward_std": 0.9606313705444336, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44343751668930054, "step": 1881 }, { "completion_length": 134.25, "epoch": 1.0069555912252541, "grad_norm": 1.541482925415039, "kl": 0.16625331342220306, "learning_rate": 4.201083724338284e-06, "loss": 0.0067, "reward": 1.7406562566757202, "reward_std": 0.8969035148620605, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4594062566757202, "step": 1882 }, { "completion_length": 113.125, "epoch": 1.0074906367041199, "grad_norm": 0.9777933955192566, "kl": 0.17304953932762146, "learning_rate": 4.199942793249551e-06, "loss": 0.0069, "reward": 2.274625062942505, "reward_std": 0.941665768623352, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4933750033378601, "step": 1883 }, { "completion_length": 144.8125, "epoch": 1.0080256821829856, "grad_norm": 1.0036334991455078, "kl": 0.17775577306747437, "learning_rate": 4.198801203230415e-06, "loss": 0.0071, "reward": 1.9443438053131104, "reward_std": 1.0535262823104858, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.42871877551078796, "step": 1884 }, { "completion_length": 123.1875, "epoch": 1.0085607276618513, "grad_norm": 1.0136218070983887, "kl": 0.14647826552391052, "learning_rate": 4.197658954723379e-06, "loss": 0.0059, "reward": 2.341437578201294, "reward_std": 1.0801634788513184, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43518751859664917, "step": 1885 }, { "completion_length": 126.71875, "epoch": 1.0090957731407169, "grad_norm": 1.2167962789535522, "kl": 0.1411893665790558, "learning_rate": 4.196516048171201e-06, "loss": 0.0056, "reward": 2.291281223297119, "reward_std": 0.6646555662155151, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47878125309944153, "step": 1886 }, { "completion_length": 131.65625, "epoch": 1.0096308186195826, "grad_norm": 1.991093635559082, "kl": 0.1776863932609558, "learning_rate": 4.195372484016893e-06, "loss": 0.0071, "reward": 2.1857187747955322, "reward_std": 0.9651218056678772, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4357187747955322, "step": 1887 }, { "completion_length": 145.71875, "epoch": 1.0101658640984483, "grad_norm": 0.5840334892272949, "kl": 0.12700903415679932, "learning_rate": 4.194228262703722e-06, "loss": 0.0051, "reward": 1.737874984741211, "reward_std": 0.6732094287872314, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45662498474121094, "step": 1888 }, { "completion_length": 155.15625, "epoch": 1.010700909577314, "grad_norm": 2.7587952613830566, "kl": 0.18960794806480408, "learning_rate": 4.193083384675211e-06, "loss": 0.0076, "reward": 1.5075937509536743, "reward_std": 0.766236424446106, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3669687509536743, "step": 1889 }, { "completion_length": 143.28125, "epoch": 1.0112359550561798, "grad_norm": 14.504652976989746, "kl": 0.8791550397872925, "learning_rate": 4.191937850375136e-06, "loss": 0.0352, "reward": 1.7545626163482666, "reward_std": 1.148590087890625, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42643749713897705, "step": 1890 }, { "completion_length": 128.46875, "epoch": 1.0117710005350455, "grad_norm": 1.5816761255264282, "kl": 0.28064393997192383, "learning_rate": 4.190791660247529e-06, "loss": 0.0112, "reward": 1.4870624542236328, "reward_std": 0.4400644302368164, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4714375138282776, "step": 1891 }, { "completion_length": 131.1875, "epoch": 1.0123060460139113, "grad_norm": 0.8077201843261719, "kl": 0.176093190908432, "learning_rate": 4.189644814736674e-06, "loss": 0.007, "reward": 2.6566874980926514, "reward_std": 0.636917233467102, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46918749809265137, "step": 1892 }, { "completion_length": 120.84375, "epoch": 1.0128410914927768, "grad_norm": 0.7257598638534546, "kl": 0.12759694457054138, "learning_rate": 4.18849731428711e-06, "loss": 0.0051, "reward": 2.5834062099456787, "reward_std": 1.0355875492095947, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4427812397480011, "step": 1893 }, { "completion_length": 119.96875, "epoch": 1.0133761369716425, "grad_norm": 3.0977845191955566, "kl": 0.28240370750427246, "learning_rate": 4.187349159343632e-06, "loss": 0.0113, "reward": 1.7280625104904175, "reward_std": 0.8202614784240723, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4624374806880951, "step": 1894 }, { "completion_length": 130.25, "epoch": 1.0139111824505083, "grad_norm": 1.7451823949813843, "kl": 0.4002632200717926, "learning_rate": 4.1862003503512845e-06, "loss": 0.016, "reward": 2.695187568664551, "reward_std": 0.5610666871070862, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 1895 }, { "completion_length": 123.59375, "epoch": 1.014446227929374, "grad_norm": 1.5408891439437866, "kl": 0.14175336062908173, "learning_rate": 4.185050887755371e-06, "loss": 0.0057, "reward": 2.1000938415527344, "reward_std": 0.6870654821395874, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4750937521457672, "step": 1896 }, { "completion_length": 137.78125, "epoch": 1.0149812734082397, "grad_norm": 1.8507267236709595, "kl": 0.1540081650018692, "learning_rate": 4.183900772001442e-06, "loss": 0.0062, "reward": 1.950124979019165, "reward_std": 0.6523096561431885, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4813750088214874, "step": 1897 }, { "completion_length": 129.34375, "epoch": 1.0155163188871055, "grad_norm": 1.3101706504821777, "kl": 0.16102594137191772, "learning_rate": 4.182750003535308e-06, "loss": 0.0064, "reward": 2.4087812900543213, "reward_std": 0.8734357953071594, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4712812304496765, "step": 1898 }, { "completion_length": 155.1875, "epoch": 1.0160513643659712, "grad_norm": 0.7649242281913757, "kl": 0.17608825862407684, "learning_rate": 4.181598582803024e-06, "loss": 0.007, "reward": 1.82631254196167, "reward_std": 0.8389162421226501, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.43568748235702515, "step": 1899 }, { "completion_length": 138.90625, "epoch": 1.0165864098448367, "grad_norm": 0.7015626430511475, "kl": 0.1411590576171875, "learning_rate": 4.180446510250907e-06, "loss": 0.0056, "reward": 2.437687397003174, "reward_std": 0.5181930065155029, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.484562486410141, "step": 1900 }, { "completion_length": 144.0, "epoch": 1.0171214553237025, "grad_norm": 1.0352612733840942, "kl": 0.1490020751953125, "learning_rate": 4.17929378632552e-06, "loss": 0.006, "reward": 1.7877187728881836, "reward_std": 1.030930995941162, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4283437430858612, "step": 1901 }, { "completion_length": 127.03125, "epoch": 1.0176565008025682, "grad_norm": 231902.0625, "kl": 52666.4921875, "learning_rate": 4.178140411473682e-06, "loss": 2106.6594, "reward": 2.296781301498413, "reward_std": 0.7734881043434143, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4530312716960907, "step": 1902 }, { "completion_length": 126.21875, "epoch": 1.018191546281434, "grad_norm": 0.6813410520553589, "kl": 0.15030315518379211, "learning_rate": 4.176986386142464e-06, "loss": 0.006, "reward": 2.1938438415527344, "reward_std": 0.7008571624755859, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4750937521457672, "step": 1903 }, { "completion_length": 133.78125, "epoch": 1.0187265917602997, "grad_norm": 0.7854161858558655, "kl": 0.15586304664611816, "learning_rate": 4.175831710779186e-06, "loss": 0.0062, "reward": 2.1695938110351562, "reward_std": 1.0734652280807495, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4508437514305115, "step": 1904 }, { "completion_length": 139.84375, "epoch": 1.0192616372391654, "grad_norm": 0.9280032515525818, "kl": 0.1284007728099823, "learning_rate": 4.174676385831424e-06, "loss": 0.0051, "reward": 1.4291250705718994, "reward_std": 0.652008056640625, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.44475001096725464, "step": 1905 }, { "completion_length": 132.65625, "epoch": 1.019796682718031, "grad_norm": 0.5857720971107483, "kl": 0.1333703249692917, "learning_rate": 4.173520411747004e-06, "loss": 0.0053, "reward": 2.7536563873291016, "reward_std": 0.45518040657043457, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.456781268119812, "step": 1906 }, { "completion_length": 144.75, "epoch": 1.0203317281968967, "grad_norm": 1.4369254112243652, "kl": 0.16397027671337128, "learning_rate": 4.172363788974003e-06, "loss": 0.0066, "reward": 2.000093698501587, "reward_std": 1.062943458557129, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4219687581062317, "step": 1907 }, { "completion_length": 131.09375, "epoch": 1.0208667736757624, "grad_norm": 1.2707138061523438, "kl": 0.17448750138282776, "learning_rate": 4.171206517960751e-06, "loss": 0.007, "reward": 1.92578125, "reward_std": 0.6087197065353394, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44140625, "step": 1908 }, { "completion_length": 146.28125, "epoch": 1.0214018191546281, "grad_norm": 1.7372759580612183, "kl": 0.15560105443000793, "learning_rate": 4.1700485991558275e-06, "loss": 0.0062, "reward": 1.8168437480926514, "reward_std": 0.8621686697006226, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41059374809265137, "step": 1909 }, { "completion_length": 145.875, "epoch": 1.0219368646334939, "grad_norm": 14.09399127960205, "kl": 0.25839918851852417, "learning_rate": 4.168890033008065e-06, "loss": 0.0103, "reward": 2.3232500553131104, "reward_std": 0.7493441700935364, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4169999957084656, "step": 1910 }, { "completion_length": 151.34375, "epoch": 1.0224719101123596, "grad_norm": 0.8162753582000732, "kl": 0.11498238146305084, "learning_rate": 4.167730819966545e-06, "loss": 0.0046, "reward": 2.0215625762939453, "reward_std": 0.9409189820289612, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45906251668930054, "step": 1911 }, { "completion_length": 127.0, "epoch": 1.0230069555912253, "grad_norm": 0.8063371777534485, "kl": 0.18116340041160583, "learning_rate": 4.166570960480601e-06, "loss": 0.0072, "reward": 2.5475311279296875, "reward_std": 0.7432681918144226, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43815624713897705, "step": 1912 }, { "completion_length": 131.1875, "epoch": 1.0235420010700909, "grad_norm": 0.5152719616889954, "kl": 0.14251428842544556, "learning_rate": 4.165410454999817e-06, "loss": 0.0057, "reward": 2.4699063301086426, "reward_std": 0.39978331327438354, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4542812407016754, "step": 1913 }, { "completion_length": 143.96875, "epoch": 1.0240770465489566, "grad_norm": 3.1840462684631348, "kl": 0.3558053970336914, "learning_rate": 4.1642493039740274e-06, "loss": 0.0142, "reward": 1.7759374380111694, "reward_std": 0.9682320356369019, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4321874976158142, "step": 1914 }, { "completion_length": 150.65625, "epoch": 1.0246120920278223, "grad_norm": 8507301.0, "kl": 383977.125, "learning_rate": 4.163087507853315e-06, "loss": 15359.0859, "reward": 1.8709688186645508, "reward_std": 1.0683706998825073, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.417843759059906, "step": 1915 }, { "completion_length": 144.375, "epoch": 1.025147137506688, "grad_norm": 3.2958357334136963, "kl": 0.21924638748168945, "learning_rate": 4.161925067088017e-06, "loss": 0.0088, "reward": 1.9751250743865967, "reward_std": 0.904320240020752, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4126250147819519, "step": 1916 }, { "completion_length": 161.5625, "epoch": 1.0256821829855538, "grad_norm": 0.8399247527122498, "kl": 0.12619595229625702, "learning_rate": 4.1607619821287155e-06, "loss": 0.005, "reward": 1.5436562299728394, "reward_std": 0.9987636208534241, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.40303125977516174, "step": 1917 }, { "completion_length": 127.875, "epoch": 1.0262172284644195, "grad_norm": 4.436149597167969, "kl": 0.20729273557662964, "learning_rate": 4.159598253426245e-06, "loss": 0.0083, "reward": 2.580937385559082, "reward_std": 0.6645864248275757, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4715625047683716, "step": 1918 }, { "completion_length": 136.96875, "epoch": 1.0267522739432853, "grad_norm": 1.535443663597107, "kl": 0.16925668716430664, "learning_rate": 4.15843388143169e-06, "loss": 0.0068, "reward": 2.28125, "reward_std": 0.7430055737495422, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.5, "step": 1919 }, { "completion_length": 127.84375, "epoch": 1.0272873194221508, "grad_norm": 0.690868079662323, "kl": 0.1757861077785492, "learning_rate": 4.157268866596381e-06, "loss": 0.007, "reward": 1.8124375343322754, "reward_std": 0.3562646508216858, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.484312504529953, "step": 1920 }, { "completion_length": 138.625, "epoch": 1.0278223649010165, "grad_norm": 1.2230720520019531, "kl": 0.2375502735376358, "learning_rate": 4.156103209371903e-06, "loss": 0.0095, "reward": 1.6795001029968262, "reward_std": 0.4900033473968506, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.476375013589859, "step": 1921 }, { "completion_length": 138.625, "epoch": 1.0283574103798823, "grad_norm": 1.0810774564743042, "kl": 0.15402758121490479, "learning_rate": 4.1549369102100854e-06, "loss": 0.0062, "reward": 1.751312494277954, "reward_std": 0.6899203658103943, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4700624942779541, "step": 1922 }, { "completion_length": 158.4375, "epoch": 1.028892455858748, "grad_norm": 0.7567098736763, "kl": 0.11496575176715851, "learning_rate": 4.153769969563008e-06, "loss": 0.0046, "reward": 1.325624942779541, "reward_std": 0.6685399413108826, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4193750023841858, "step": 1923 }, { "completion_length": 141.40625, "epoch": 1.0294275013376137, "grad_norm": 1.3517814874649048, "kl": 0.16432547569274902, "learning_rate": 4.1526023878830015e-06, "loss": 0.0066, "reward": 1.8071250915527344, "reward_std": 0.7850834131240845, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4790000021457672, "step": 1924 }, { "completion_length": 150.375, "epoch": 1.0299625468164795, "grad_norm": 1.436967134475708, "kl": 0.3681671619415283, "learning_rate": 4.1514341656226394e-06, "loss": 0.0147, "reward": 1.8507499694824219, "reward_std": 0.9299060106277466, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44449999928474426, "step": 1925 }, { "completion_length": 151.34375, "epoch": 1.0304975922953452, "grad_norm": 0.9496452808380127, "kl": 0.19968360662460327, "learning_rate": 4.15026530323475e-06, "loss": 0.008, "reward": 1.7714687585830688, "reward_std": 0.9705916047096252, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41209375858306885, "step": 1926 }, { "completion_length": 160.9375, "epoch": 1.0310326377742107, "grad_norm": 0.8150053024291992, "kl": 0.13630470633506775, "learning_rate": 4.149095801172405e-06, "loss": 0.0055, "reward": 1.2850937843322754, "reward_std": 0.5797086358070374, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.441343754529953, "step": 1927 }, { "completion_length": 116.0625, "epoch": 1.0315676832530765, "grad_norm": 1.2399697303771973, "kl": 0.1950991302728653, "learning_rate": 4.147925659888927e-06, "loss": 0.0078, "reward": 2.483687400817871, "reward_std": 0.8134075403213501, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49931249022483826, "step": 1928 }, { "completion_length": 131.96875, "epoch": 1.0321027287319422, "grad_norm": 2.064646005630493, "kl": 0.22841954231262207, "learning_rate": 4.146754879837885e-06, "loss": 0.0091, "reward": 1.9162499904632568, "reward_std": 0.5968930125236511, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.43187499046325684, "step": 1929 }, { "completion_length": 160.71875, "epoch": 1.032637774210808, "grad_norm": 0.761561393737793, "kl": 0.14128559827804565, "learning_rate": 4.145583461473095e-06, "loss": 0.0057, "reward": 1.4589375257492065, "reward_std": 0.9941895008087158, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.36518749594688416, "step": 1930 }, { "completion_length": 145.125, "epoch": 1.0331728196896737, "grad_norm": 0.7392420172691345, "kl": 0.17847448587417603, "learning_rate": 4.144411405248621e-06, "loss": 0.0071, "reward": 1.627343773841858, "reward_std": 0.9545502662658691, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4554687440395355, "step": 1931 }, { "completion_length": 133.84375, "epoch": 1.0337078651685394, "grad_norm": 0.7613722085952759, "kl": 0.17125779390335083, "learning_rate": 4.143238711618775e-06, "loss": 0.0069, "reward": 1.77734375, "reward_std": 0.9877244830131531, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46484375, "step": 1932 }, { "completion_length": 146.625, "epoch": 1.034242910647405, "grad_norm": 0.5000748038291931, "kl": 0.1613740175962448, "learning_rate": 4.1420653810381175e-06, "loss": 0.0065, "reward": 2.34765625, "reward_std": 0.8576945066452026, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1933 }, { "completion_length": 129.96875, "epoch": 1.0347779561262707, "grad_norm": 1.7438886165618896, "kl": 0.40383678674697876, "learning_rate": 4.140891413961451e-06, "loss": 0.0162, "reward": 2.7499375343322754, "reward_std": 0.7962584495544434, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 1934 }, { "completion_length": 130.625, "epoch": 1.0353130016051364, "grad_norm": 0.9529166221618652, "kl": 0.2027619481086731, "learning_rate": 4.139716810843829e-06, "loss": 0.0081, "reward": 2.1253437995910645, "reward_std": 0.9612632989883423, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4690937399864197, "step": 1935 }, { "completion_length": 130.5, "epoch": 1.0358480470840021, "grad_norm": 1.4690687656402588, "kl": 0.21641018986701965, "learning_rate": 4.138541572140549e-06, "loss": 0.0087, "reward": 2.2730000019073486, "reward_std": 1.2943400144577026, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44487500190734863, "step": 1936 }, { "completion_length": 130.9375, "epoch": 1.0363830925628679, "grad_norm": 0.6751450300216675, "kl": 0.1838127225637436, "learning_rate": 4.137365698307157e-06, "loss": 0.0074, "reward": 1.9720312356948853, "reward_std": 0.9243003129959106, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47203123569488525, "step": 1937 }, { "completion_length": 114.25, "epoch": 1.0369181380417336, "grad_norm": 6.98576021194458, "kl": 0.3232756555080414, "learning_rate": 4.136189189799444e-06, "loss": 0.0129, "reward": 2.757312536239624, "reward_std": 0.9888120889663696, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49168750643730164, "step": 1938 }, { "completion_length": 135.5625, "epoch": 1.0374531835205993, "grad_norm": 0.8119953274726868, "kl": 0.18097740411758423, "learning_rate": 4.135012047073449e-06, "loss": 0.0072, "reward": 1.7337499856948853, "reward_std": 0.8925206661224365, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46812498569488525, "step": 1939 }, { "completion_length": 156.90625, "epoch": 1.0379882289994649, "grad_norm": 1.7863470315933228, "kl": 0.11168848723173141, "learning_rate": 4.1338342705854515e-06, "loss": 0.0045, "reward": 1.8305000066757202, "reward_std": 0.6421513557434082, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3930000066757202, "step": 1940 }, { "completion_length": 119.375, "epoch": 1.0385232744783306, "grad_norm": 0.6967492699623108, "kl": 0.14020578563213348, "learning_rate": 4.132655860791983e-06, "loss": 0.0056, "reward": 2.633125066757202, "reward_std": 0.7847118377685547, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4925000071525574, "step": 1941 }, { "completion_length": 153.8125, "epoch": 1.0390583199571963, "grad_norm": 2.1242265701293945, "kl": 0.15527480840682983, "learning_rate": 4.131476818149817e-06, "loss": 0.0062, "reward": 1.7419999837875366, "reward_std": 0.8746082186698914, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3982499837875366, "step": 1942 }, { "completion_length": 134.125, "epoch": 1.039593365436062, "grad_norm": 0.866944432258606, "kl": 0.23915992677211761, "learning_rate": 4.130297143115974e-06, "loss": 0.0096, "reward": 2.3847813606262207, "reward_std": 0.8393886089324951, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47853124141693115, "step": 1943 }, { "completion_length": 139.0, "epoch": 1.0401284109149278, "grad_norm": 2.0919313430786133, "kl": 0.17634305357933044, "learning_rate": 4.129116836147715e-06, "loss": 0.0071, "reward": 1.7343125343322754, "reward_std": 0.6483592987060547, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.453062504529953, "step": 1944 }, { "completion_length": 141.96875, "epoch": 1.0406634563937935, "grad_norm": 0.8187460899353027, "kl": 0.1643836796283722, "learning_rate": 4.127935897702554e-06, "loss": 0.0066, "reward": 1.8984375, "reward_std": 0.8932957649230957, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 1945 }, { "completion_length": 144.875, "epoch": 1.0411985018726593, "grad_norm": 0.886022686958313, "kl": 0.18476736545562744, "learning_rate": 4.126754328238243e-06, "loss": 0.0074, "reward": 2.011593818664551, "reward_std": 0.8142080903053284, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.464718759059906, "step": 1946 }, { "completion_length": 149.90625, "epoch": 1.0417335473515248, "grad_norm": 1.1943895816802979, "kl": 0.15324078500270844, "learning_rate": 4.125572128212781e-06, "loss": 0.0061, "reward": 1.651687502861023, "reward_std": 1.059070348739624, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.38606247305870056, "step": 1947 }, { "completion_length": 137.53125, "epoch": 1.0422685928303905, "grad_norm": 0.993851363658905, "kl": 0.22248542308807373, "learning_rate": 4.124389298084413e-06, "loss": 0.0089, "reward": 1.8858749866485596, "reward_std": 0.7142146825790405, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46399998664855957, "step": 1948 }, { "completion_length": 122.34375, "epoch": 1.0428036383092563, "grad_norm": 1.0114508867263794, "kl": 0.19383741915225983, "learning_rate": 4.123205838311625e-06, "loss": 0.0078, "reward": 2.459749937057495, "reward_std": 1.1573102474212646, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4909999966621399, "step": 1949 }, { "completion_length": 152.78125, "epoch": 1.043338683788122, "grad_norm": 3.342052459716797, "kl": 0.42907634377479553, "learning_rate": 4.12202174935315e-06, "loss": 0.0172, "reward": 1.71484375, "reward_std": 1.1241166591644287, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41796875, "step": 1950 }, { "completion_length": 139.625, "epoch": 1.0438737292669877, "grad_norm": 0.5641137957572937, "kl": 0.1628257781267166, "learning_rate": 4.1208370316679615e-06, "loss": 0.0065, "reward": 1.7560625076293945, "reward_std": 0.7587039470672607, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45918750762939453, "step": 1951 }, { "completion_length": 147.5, "epoch": 1.0444087747458535, "grad_norm": 0.863126814365387, "kl": 0.1344728171825409, "learning_rate": 4.11965168571528e-06, "loss": 0.0054, "reward": 2.3319687843322754, "reward_std": 1.0508649349212646, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.488218754529953, "step": 1952 }, { "completion_length": 118.25, "epoch": 1.0449438202247192, "grad_norm": 1.3607151508331299, "kl": 0.17765159904956818, "learning_rate": 4.11846571195457e-06, "loss": 0.0071, "reward": 2.3540313243865967, "reward_std": 0.7111170291900635, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4790312647819519, "step": 1953 }, { "completion_length": 136.1875, "epoch": 1.0454788657035847, "grad_norm": 3.0090274810791016, "kl": 0.592434287071228, "learning_rate": 4.117279110845535e-06, "loss": 0.0237, "reward": 1.9159061908721924, "reward_std": 0.7291907668113708, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.46278125047683716, "step": 1954 }, { "completion_length": 119.09375, "epoch": 1.0460139111824505, "grad_norm": 7199.29150390625, "kl": 41.733558654785156, "learning_rate": 4.116091882848125e-06, "loss": 1.6693, "reward": 1.98828125, "reward_std": 0.7861577272415161, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1955 }, { "completion_length": 112.3125, "epoch": 1.0465489566613162, "grad_norm": 1.0885671377182007, "kl": 0.19171124696731567, "learning_rate": 4.114904028422533e-06, "loss": 0.0077, "reward": 2.140500068664551, "reward_std": 0.7535490393638611, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 1956 }, { "completion_length": 130.65625, "epoch": 1.047084002140182, "grad_norm": 0.9815663695335388, "kl": 0.17075444757938385, "learning_rate": 4.113715548029193e-06, "loss": 0.0068, "reward": 1.9909374713897705, "reward_std": 0.584769606590271, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4596875011920929, "step": 1957 }, { "completion_length": 152.28125, "epoch": 1.0476190476190477, "grad_norm": 23890690048.0, "kl": 33305696.0, "learning_rate": 4.112526442128785e-06, "loss": 1332227.75, "reward": 0.9849063158035278, "reward_std": 0.360092431306839, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42240625619888306, "step": 1958 }, { "completion_length": 144.34375, "epoch": 1.0481540930979134, "grad_norm": 0.8596622347831726, "kl": 0.16628983616828918, "learning_rate": 4.111336711182227e-06, "loss": 0.0067, "reward": 1.4679999351501465, "reward_std": 0.8020339012145996, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43674999475479126, "step": 1959 }, { "completion_length": 123.15625, "epoch": 1.048689138576779, "grad_norm": 1.7120943069458008, "kl": 0.14538854360580444, "learning_rate": 4.110146355650682e-06, "loss": 0.0058, "reward": 2.2989375591278076, "reward_std": 0.5506631135940552, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.43956249952316284, "step": 1960 }, { "completion_length": 143.40625, "epoch": 1.0492241840556447, "grad_norm": 4.791566371917725, "kl": 0.17799390852451324, "learning_rate": 4.1089553759955555e-06, "loss": 0.0071, "reward": 1.648031234741211, "reward_std": 0.7734537124633789, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46053123474121094, "step": 1961 }, { "completion_length": 138.4375, "epoch": 1.0497592295345104, "grad_norm": 0.8216515183448792, "kl": 0.1391848772764206, "learning_rate": 4.107763772678494e-06, "loss": 0.0056, "reward": 1.839593768119812, "reward_std": 0.7211884260177612, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.480218768119812, "step": 1962 }, { "completion_length": 150.09375, "epoch": 1.0502942750133761, "grad_norm": 0.7838181257247925, "kl": 0.15301164984703064, "learning_rate": 4.106571546161384e-06, "loss": 0.0061, "reward": 1.94140625, "reward_std": 0.7512303590774536, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.48828125, "step": 1963 }, { "completion_length": 124.5, "epoch": 1.0508293204922419, "grad_norm": 0.8030720949172974, "kl": 0.15948271751403809, "learning_rate": 4.105378696906358e-06, "loss": 0.0064, "reward": 2.2265625, "reward_std": 0.8907632827758789, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4921875, "step": 1964 }, { "completion_length": 115.65625, "epoch": 1.0513643659711076, "grad_norm": 3.674408435821533, "kl": 0.19354525208473206, "learning_rate": 4.104185225375784e-06, "loss": 0.0077, "reward": 2.4665937423706055, "reward_std": 1.0396945476531982, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48221874237060547, "step": 1965 }, { "completion_length": 148.40625, "epoch": 1.0518994114499733, "grad_norm": 2.801992177963257, "kl": 0.3619512617588043, "learning_rate": 4.102991132032278e-06, "loss": 0.0145, "reward": 2.014812469482422, "reward_std": 0.8209424018859863, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43668749928474426, "step": 1966 }, { "completion_length": 133.0, "epoch": 1.0524344569288389, "grad_norm": 2.091053009033203, "kl": 0.15598216652870178, "learning_rate": 4.101796417338691e-06, "loss": 0.0062, "reward": 2.4254062175750732, "reward_std": 0.7591388821601868, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48790624737739563, "step": 1967 }, { "completion_length": 146.34375, "epoch": 1.0529695024077046, "grad_norm": 0.6089209318161011, "kl": 0.15714429318904877, "learning_rate": 4.1006010817581175e-06, "loss": 0.0063, "reward": 1.4883437156677246, "reward_std": 0.7969777584075928, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.425843745470047, "step": 1968 }, { "completion_length": 137.9375, "epoch": 1.0535045478865703, "grad_norm": 273118400.0, "kl": 394078.875, "learning_rate": 4.099405125753894e-06, "loss": 15763.1543, "reward": 2.5067811012268066, "reward_std": 1.0394580364227295, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4911562502384186, "step": 1969 }, { "completion_length": 106.0, "epoch": 1.054039593365436, "grad_norm": 0.859656810760498, "kl": 0.2139473408460617, "learning_rate": 4.098208549789594e-06, "loss": 0.0086, "reward": 1.636312484741211, "reward_std": 0.5958877205848694, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4800625145435333, "step": 1970 }, { "completion_length": 125.875, "epoch": 1.0545746388443018, "grad_norm": 1.609183430671692, "kl": 0.16762882471084595, "learning_rate": 4.097011354329035e-06, "loss": 0.0067, "reward": 1.9995625019073486, "reward_std": 1.0094425678253174, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49956250190734863, "step": 1971 }, { "completion_length": 112.6875, "epoch": 1.0551096843231675, "grad_norm": 155065184.0, "kl": 32557388.0, "learning_rate": 4.095813539836272e-06, "loss": 1302295.5, "reward": 1.8861563205718994, "reward_std": 0.7353253960609436, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47990626096725464, "step": 1972 }, { "completion_length": 145.9375, "epoch": 1.0556447298020333, "grad_norm": 0.6764001250267029, "kl": 0.15319909155368805, "learning_rate": 4.094615106775601e-06, "loss": 0.0061, "reward": 1.6269062757492065, "reward_std": 0.8263756632804871, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.37690627574920654, "step": 1973 }, { "completion_length": 154.5625, "epoch": 1.0561797752808988, "grad_norm": 0.9062497615814209, "kl": 0.12957407534122467, "learning_rate": 4.093416055611558e-06, "loss": 0.0052, "reward": 1.5504062175750732, "reward_std": 0.7735299468040466, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.47228124737739563, "step": 1974 }, { "completion_length": 129.71875, "epoch": 1.0567148207597645, "grad_norm": 0.9707196950912476, "kl": 0.18924209475517273, "learning_rate": 4.092216386808917e-06, "loss": 0.0076, "reward": 1.6595312356948853, "reward_std": 0.8290979266166687, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48765623569488525, "step": 1975 }, { "completion_length": 126.40625, "epoch": 1.0572498662386303, "grad_norm": 6955.37255859375, "kl": 5.034479141235352, "learning_rate": 4.091016100832693e-06, "loss": 0.2014, "reward": 2.3340938091278076, "reward_std": 0.4320604205131531, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49034374952316284, "step": 1976 }, { "completion_length": 132.125, "epoch": 1.057784911717496, "grad_norm": 0.625192403793335, "kl": 0.1465221643447876, "learning_rate": 4.089815198148142e-06, "loss": 0.0059, "reward": 2.9375, "reward_std": 0.694325864315033, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 1977 }, { "completion_length": 129.5, "epoch": 1.0583199571963617, "grad_norm": 2.1221370697021484, "kl": 0.14699549973011017, "learning_rate": 4.088613679220754e-06, "loss": 0.0059, "reward": 2.609375, "reward_std": 0.7291996479034424, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 1978 }, { "completion_length": 138.53125, "epoch": 1.0588550026752275, "grad_norm": 2.8964898586273193, "kl": 0.17053189873695374, "learning_rate": 4.087411544516262e-06, "loss": 0.0068, "reward": 1.8063125610351562, "reward_std": 0.7312458753585815, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4625625014305115, "step": 1979 }, { "completion_length": 127.78125, "epoch": 1.0593900481540932, "grad_norm": 1.4565876722335815, "kl": 0.1675315499305725, "learning_rate": 4.086208794500637e-06, "loss": 0.0067, "reward": 2.057687520980835, "reward_std": 0.7300370931625366, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46393752098083496, "step": 1980 }, { "completion_length": 142.5, "epoch": 1.0599250936329587, "grad_norm": 0.7506701946258545, "kl": 0.13147960603237152, "learning_rate": 4.0850054296400875e-06, "loss": 0.0053, "reward": 2.01171875, "reward_std": 0.4341123700141907, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.41796875, "step": 1981 }, { "completion_length": 145.59375, "epoch": 1.0604601391118245, "grad_norm": 0.8178697824478149, "kl": 0.17403164505958557, "learning_rate": 4.0838014504010605e-06, "loss": 0.007, "reward": 1.8448749780654907, "reward_std": 0.9354063272476196, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4230000078678131, "step": 1982 }, { "completion_length": 133.40625, "epoch": 1.0609951845906902, "grad_norm": 1.5497455596923828, "kl": 0.25288087129592896, "learning_rate": 4.08259685725024e-06, "loss": 0.0101, "reward": 1.7978750467300415, "reward_std": 0.5061144828796387, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4853750169277191, "step": 1983 }, { "completion_length": 154.15625, "epoch": 1.061530230069556, "grad_norm": 0.6376597285270691, "kl": 0.12139758467674255, "learning_rate": 4.081391650654553e-06, "loss": 0.0049, "reward": 2.0621249675750732, "reward_std": 1.155389428138733, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45274999737739563, "step": 1984 }, { "completion_length": 127.125, "epoch": 1.0620652755484217, "grad_norm": 0.8682750463485718, "kl": 0.20004212856292725, "learning_rate": 4.080185831081157e-06, "loss": 0.008, "reward": 2.4921875, "reward_std": 0.8584929704666138, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 1985 }, { "completion_length": 153.71875, "epoch": 1.0626003210272874, "grad_norm": 2.9992547035217285, "kl": 0.2343507558107376, "learning_rate": 4.0789793989974536e-06, "loss": 0.0094, "reward": 2.326531171798706, "reward_std": 0.769955039024353, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45153123140335083, "step": 1986 }, { "completion_length": 134.34375, "epoch": 1.063135366506153, "grad_norm": 1.0115300416946411, "kl": 0.17155048251152039, "learning_rate": 4.0777723548710764e-06, "loss": 0.0069, "reward": 2.66015625, "reward_std": 0.975351095199585, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1987 }, { "completion_length": 114.15625, "epoch": 1.0636704119850187, "grad_norm": 0.9723459482192993, "kl": 0.20270609855651855, "learning_rate": 4.076564699169902e-06, "loss": 0.0081, "reward": 2.42578125, "reward_std": 0.46302366256713867, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 1988 }, { "completion_length": 133.5, "epoch": 1.0642054574638844, "grad_norm": 0.5620776414871216, "kl": 0.16528698801994324, "learning_rate": 4.0753564323620375e-06, "loss": 0.0066, "reward": 1.4432812929153442, "reward_std": 0.4480164647102356, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45890623331069946, "step": 1989 }, { "completion_length": 107.375, "epoch": 1.0647405029427501, "grad_norm": 0.7406893968582153, "kl": 0.1478179693222046, "learning_rate": 4.074147554915833e-06, "loss": 0.0059, "reward": 1.97837495803833, "reward_std": 0.95167076587677, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49400001764297485, "step": 1990 }, { "completion_length": 141.0, "epoch": 1.0652755484216159, "grad_norm": 1.042427897453308, "kl": 0.18165206909179688, "learning_rate": 4.072938067299871e-06, "loss": 0.0073, "reward": 1.866687536239624, "reward_std": 0.868912935256958, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44481250643730164, "step": 1991 }, { "completion_length": 139.625, "epoch": 1.0658105939004816, "grad_norm": 0.9226694107055664, "kl": 0.1437678039073944, "learning_rate": 4.071727969982973e-06, "loss": 0.0058, "reward": 1.9758124351501465, "reward_std": 0.932772159576416, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47581249475479126, "step": 1992 }, { "completion_length": 134.8125, "epoch": 1.0663456393793473, "grad_norm": 1.031589388847351, "kl": 0.22752922773361206, "learning_rate": 4.070517263434195e-06, "loss": 0.0091, "reward": 2.0958125591278076, "reward_std": 0.9279689788818359, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45518749952316284, "step": 1993 }, { "completion_length": 140.375, "epoch": 1.0668806848582129, "grad_norm": 59.73602294921875, "kl": 4.921256065368652, "learning_rate": 4.06930594812283e-06, "loss": 0.1969, "reward": 1.87890625, "reward_std": 1.0300734043121338, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48828125, "step": 1994 }, { "completion_length": 126.1875, "epoch": 1.0674157303370786, "grad_norm": 0.8514909148216248, "kl": 0.16683387756347656, "learning_rate": 4.068094024518408e-06, "loss": 0.0067, "reward": 2.06640625, "reward_std": 0.43620848655700684, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48828125, "step": 1995 }, { "completion_length": 125.71875, "epoch": 1.0679507758159443, "grad_norm": 695777.1875, "kl": 1529.0673828125, "learning_rate": 4.066881493090693e-06, "loss": 61.1627, "reward": 2.0003437995910645, "reward_std": 0.6736217141151428, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4690937399864197, "step": 1996 }, { "completion_length": 139.5, "epoch": 1.06848582129481, "grad_norm": 1.4143893718719482, "kl": 0.37785130739212036, "learning_rate": 4.065668354309686e-06, "loss": 0.0151, "reward": 2.023312568664551, "reward_std": 0.9017601013183594, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 1997 }, { "completion_length": 135.65625, "epoch": 1.0690208667736758, "grad_norm": 0.8834018111228943, "kl": 0.16904893517494202, "learning_rate": 4.064454608645622e-06, "loss": 0.0068, "reward": 2.183468818664551, "reward_std": 0.7277542948722839, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.464718759059906, "step": 1998 }, { "completion_length": 150.375, "epoch": 1.0695559122525415, "grad_norm": 0.8574283123016357, "kl": 0.1457737237215042, "learning_rate": 4.063240256568974e-06, "loss": 0.0058, "reward": 1.7492499351501465, "reward_std": 0.8399232029914856, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43674999475479126, "step": 1999 }, { "completion_length": 132.9375, "epoch": 1.070090957731407, "grad_norm": 0.8399838209152222, "kl": 0.18371686339378357, "learning_rate": 4.062025298550446e-06, "loss": 0.0073, "reward": 1.5958750247955322, "reward_std": 0.7254650592803955, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.47087499499320984, "step": 2000 }, { "completion_length": 138.8125, "epoch": 1.0706260032102728, "grad_norm": 27.237531661987305, "kl": 1.1699259281158447, "learning_rate": 4.06080973506098e-06, "loss": 0.0468, "reward": 1.6786562204360962, "reward_std": 1.1499720811843872, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.41303128004074097, "step": 2001 }, { "completion_length": 145.71875, "epoch": 1.0711610486891385, "grad_norm": 1.147694706916809, "kl": 0.13484397530555725, "learning_rate": 4.059593566571751e-06, "loss": 0.0054, "reward": 1.9921875, "reward_std": 0.5078784227371216, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3828125, "step": 2002 }, { "completion_length": 131.5625, "epoch": 1.0716960941680043, "grad_norm": 13.640114784240723, "kl": 0.7539434432983398, "learning_rate": 4.0583767935541696e-06, "loss": 0.0302, "reward": 1.6271250247955322, "reward_std": 0.8473231792449951, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45524999499320984, "step": 2003 }, { "completion_length": 141.9375, "epoch": 1.07223113964687, "grad_norm": 0.7118578553199768, "kl": 0.14024733006954193, "learning_rate": 4.0571594164798815e-06, "loss": 0.0056, "reward": 2.4145312309265137, "reward_std": 1.0188210010528564, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44578126072883606, "step": 2004 }, { "completion_length": 144.0, "epoch": 1.0727661851257357, "grad_norm": 21219302.0, "kl": 742499.0, "learning_rate": 4.055941435820763e-06, "loss": 29699.959, "reward": 2.0109686851501465, "reward_std": 0.8854811787605286, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44846874475479126, "step": 2005 }, { "completion_length": 126.5625, "epoch": 1.0733012306046015, "grad_norm": 1264967552.0, "kl": 49846200.0, "learning_rate": 4.054722852048929e-06, "loss": 1993848.0, "reward": 1.96875, "reward_std": 0.871952474117279, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 2006 }, { "completion_length": 128.1875, "epoch": 1.0738362760834672, "grad_norm": 0.6325451731681824, "kl": 0.16457343101501465, "learning_rate": 4.053503665636724e-06, "loss": 0.0066, "reward": 2.38671875, "reward_std": 0.7599803805351257, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48046875, "step": 2007 }, { "completion_length": 123.875, "epoch": 1.0743713215623327, "grad_norm": 0.996271014213562, "kl": 0.16292497515678406, "learning_rate": 4.05228387705673e-06, "loss": 0.0065, "reward": 2.089750051498413, "reward_std": 0.5281733870506287, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4647499918937683, "step": 2008 }, { "completion_length": 149.46875, "epoch": 1.0749063670411985, "grad_norm": 0.6159361004829407, "kl": 0.12939408421516418, "learning_rate": 4.05106348678176e-06, "loss": 0.0052, "reward": 1.9983749389648438, "reward_std": 0.9265321493148804, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4046249985694885, "step": 2009 }, { "completion_length": 116.78125, "epoch": 1.0754414125200642, "grad_norm": 1.1323527097702026, "kl": 0.19781284034252167, "learning_rate": 4.049842495284858e-06, "loss": 0.0079, "reward": 2.343625068664551, "reward_std": 1.0623657703399658, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 2010 }, { "completion_length": 136.5, "epoch": 1.07597645799893, "grad_norm": 1.67817223072052, "kl": 0.1802271604537964, "learning_rate": 4.048620903039308e-06, "loss": 0.0072, "reward": 1.683437466621399, "reward_std": 0.7885217070579529, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4334374964237213, "step": 2011 }, { "completion_length": 126.8125, "epoch": 1.0765115034777957, "grad_norm": 0.8062768578529358, "kl": 0.13898326456546783, "learning_rate": 4.047398710518619e-06, "loss": 0.0056, "reward": 2.309499979019165, "reward_std": 0.7408764362335205, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4657500088214874, "step": 2012 }, { "completion_length": 146.09375, "epoch": 1.0770465489566614, "grad_norm": 0.6278332471847534, "kl": 0.10968858748674393, "learning_rate": 4.0461759181965375e-06, "loss": 0.0044, "reward": 1.4957187175750732, "reward_std": 0.691283643245697, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.401968777179718, "step": 2013 }, { "completion_length": 136.34375, "epoch": 1.077581594435527, "grad_norm": 0.8521351218223572, "kl": 0.17936541140079498, "learning_rate": 4.044952526547042e-06, "loss": 0.0072, "reward": 1.42578125, "reward_std": 0.7256107330322266, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.39453125, "step": 2014 }, { "completion_length": 151.46875, "epoch": 1.0781166399143927, "grad_norm": 1.5108232498168945, "kl": 0.1253524273633957, "learning_rate": 4.043728536044342e-06, "loss": 0.005, "reward": 1.5333750247955322, "reward_std": 1.131994366645813, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.37712499499320984, "step": 2015 }, { "completion_length": 113.1875, "epoch": 1.0786516853932584, "grad_norm": 1867.4586181640625, "kl": 364.42498779296875, "learning_rate": 4.04250394716288e-06, "loss": 14.577, "reward": 2.356374979019165, "reward_std": 0.7398722171783447, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4970000088214874, "step": 2016 }, { "completion_length": 140.46875, "epoch": 1.0791867308721241, "grad_norm": 587512448.0, "kl": 1069342.75, "learning_rate": 4.041278760377328e-06, "loss": 42773.7109, "reward": 1.688812494277954, "reward_std": 0.589330792427063, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4231874942779541, "step": 2017 }, { "completion_length": 151.34375, "epoch": 1.0797217763509899, "grad_norm": 1.595368504524231, "kl": 0.20556268095970154, "learning_rate": 4.040052976162595e-06, "loss": 0.0082, "reward": 1.3379688262939453, "reward_std": 0.6522570848464966, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44734376668930054, "step": 2018 }, { "completion_length": 121.0625, "epoch": 1.0802568218298556, "grad_norm": 0.22580236196517944, "kl": 0.13806220889091492, "learning_rate": 4.038826594993817e-06, "loss": 0.0055, "reward": 2.7002501487731934, "reward_std": 0.47024595737457275, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4658749997615814, "step": 2019 }, { "completion_length": 155.625, "epoch": 1.0807918673087213, "grad_norm": 0.8240882158279419, "kl": 0.10483259707689285, "learning_rate": 4.037599617346362e-06, "loss": 0.0042, "reward": 1.984375, "reward_std": 0.6826430559158325, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.375, "step": 2020 }, { "completion_length": 137.15625, "epoch": 1.0813269127875869, "grad_norm": 0.9997404217720032, "kl": 0.14697052538394928, "learning_rate": 4.036372043695831e-06, "loss": 0.0059, "reward": 1.3490312099456787, "reward_std": 0.6556794047355652, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4271562695503235, "step": 2021 }, { "completion_length": 143.6875, "epoch": 1.0818619582664526, "grad_norm": 1.696716547012329, "kl": 0.16171389818191528, "learning_rate": 4.035143874518055e-06, "loss": 0.0065, "reward": 1.6911250352859497, "reward_std": 0.6458355188369751, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4723750054836273, "step": 2022 }, { "completion_length": 115.5, "epoch": 1.0823970037453183, "grad_norm": 0.7018054723739624, "kl": 0.15967398881912231, "learning_rate": 4.033915110289096e-06, "loss": 0.0064, "reward": 1.8398749828338623, "reward_std": 0.7257501482963562, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4961249828338623, "step": 2023 }, { "completion_length": 133.0, "epoch": 1.082932049224184, "grad_norm": 1.6375508308410645, "kl": 0.21676868200302124, "learning_rate": 4.0326857514852466e-06, "loss": 0.0087, "reward": 2.6136562824249268, "reward_std": 1.1553106307983398, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48865625262260437, "step": 2024 }, { "completion_length": 110.15625, "epoch": 1.0834670947030498, "grad_norm": 1.0761666297912598, "kl": 0.21945872902870178, "learning_rate": 4.03145579858303e-06, "loss": 0.0088, "reward": 2.2878124713897705, "reward_std": 1.1048285961151123, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4753125011920929, "step": 2025 }, { "completion_length": 140.375, "epoch": 1.0840021401819155, "grad_norm": 0.9300815463066101, "kl": 0.1600717008113861, "learning_rate": 4.030225252059199e-06, "loss": 0.0064, "reward": 2.086250066757202, "reward_std": 0.860694408416748, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4456250071525574, "step": 2026 }, { "completion_length": 150.75, "epoch": 1.084537185660781, "grad_norm": 0.9350849390029907, "kl": 0.1344948410987854, "learning_rate": 4.028994112390736e-06, "loss": 0.0054, "reward": 1.468343734741211, "reward_std": 0.5547541379928589, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46834373474121094, "step": 2027 }, { "completion_length": 117.0625, "epoch": 1.0850722311396468, "grad_norm": 1.7642186880111694, "kl": 0.16911935806274414, "learning_rate": 4.027762380054857e-06, "loss": 0.0068, "reward": 2.421875, "reward_std": 0.7677386999130249, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 2028 }, { "completion_length": 145.6875, "epoch": 1.0856072766185125, "grad_norm": 1.0593698024749756, "kl": 0.1304299533367157, "learning_rate": 4.026530055529002e-06, "loss": 0.0052, "reward": 2.1989688873291016, "reward_std": 0.8897804021835327, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.464593768119812, "step": 2029 }, { "completion_length": 140.375, "epoch": 1.0861423220973783, "grad_norm": 1.8625104427337646, "kl": 0.1956632435321808, "learning_rate": 4.025297139290847e-06, "loss": 0.0078, "reward": 1.3170937299728394, "reward_std": 0.8744441270828247, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.37959372997283936, "step": 2030 }, { "completion_length": 112.0625, "epoch": 1.086677367576244, "grad_norm": 1.446138620376587, "kl": 0.1571430116891861, "learning_rate": 4.0240636318182915e-06, "loss": 0.0063, "reward": 2.252812385559082, "reward_std": 0.5996181964874268, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4715625047683716, "step": 2031 }, { "completion_length": 132.0625, "epoch": 1.0872124130551097, "grad_norm": 1.3745098114013672, "kl": 0.21253350377082825, "learning_rate": 4.0228295335894665e-06, "loss": 0.0085, "reward": 2.1300625801086426, "reward_std": 0.6540379524230957, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4894375205039978, "step": 2032 }, { "completion_length": 129.09375, "epoch": 1.0877474585339755, "grad_norm": 131595592.0, "kl": 32186230.0, "learning_rate": 4.0215948450827335e-06, "loss": 1287449.125, "reward": 1.8171250820159912, "reward_std": 1.172187089920044, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42650002241134644, "step": 2033 }, { "completion_length": 135.84375, "epoch": 1.088282504012841, "grad_norm": 0.7694296836853027, "kl": 0.15804851055145264, "learning_rate": 4.0203595667766815e-06, "loss": 0.0063, "reward": 1.91015625, "reward_std": 0.673533022403717, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.48828125, "step": 2034 }, { "completion_length": 122.3125, "epoch": 1.0888175494917067, "grad_norm": 0.8955711126327515, "kl": 0.1415339857339859, "learning_rate": 4.019123699150126e-06, "loss": 0.0057, "reward": 2.620875120162964, "reward_std": 1.083433985710144, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4802500009536743, "step": 2035 }, { "completion_length": 151.0625, "epoch": 1.0893525949705725, "grad_norm": 2.0701546669006348, "kl": 0.27756941318511963, "learning_rate": 4.017887242682115e-06, "loss": 0.0111, "reward": 1.4980000257492065, "reward_std": 0.7877849340438843, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41987502574920654, "step": 2036 }, { "completion_length": 145.5625, "epoch": 1.0898876404494382, "grad_norm": 1.6184825897216797, "kl": 0.15264596045017242, "learning_rate": 4.0166501978519225e-06, "loss": 0.0061, "reward": 1.6189374923706055, "reward_std": 0.7411635518074036, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44706249237060547, "step": 2037 }, { "completion_length": 109.90625, "epoch": 1.090422685928304, "grad_norm": 7819699712.0, "kl": 35268660.0, "learning_rate": 4.01541256513905e-06, "loss": 1410746.625, "reward": 2.1875, "reward_std": 0.5960342288017273, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2038 }, { "completion_length": 123.3125, "epoch": 1.0909577314071697, "grad_norm": 3530898.0, "kl": 1121104.375, "learning_rate": 4.014174345023227e-06, "loss": 44844.1719, "reward": 2.1757187843322754, "reward_std": 0.749381959438324, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4569687843322754, "step": 2039 }, { "completion_length": 138.03125, "epoch": 1.0914927768860354, "grad_norm": 0.6905781626701355, "kl": 0.1198987364768982, "learning_rate": 4.012935537984414e-06, "loss": 0.0048, "reward": 1.3158438205718994, "reward_std": 0.47430896759033203, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.34709376096725464, "step": 2040 }, { "completion_length": 117.65625, "epoch": 1.092027822364901, "grad_norm": 89.44927215576172, "kl": 8.310189247131348, "learning_rate": 4.011696144502793e-06, "loss": 0.3324, "reward": 2.7547500133514404, "reward_std": 0.5498350858688354, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48912501335144043, "step": 2041 }, { "completion_length": 132.84375, "epoch": 1.0925628678437667, "grad_norm": 0.7995339035987854, "kl": 0.1471412628889084, "learning_rate": 4.010456165058779e-06, "loss": 0.0059, "reward": 2.2467498779296875, "reward_std": 0.880552351474762, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41862499713897705, "step": 2042 }, { "completion_length": 128.09375, "epoch": 1.0930979133226324, "grad_norm": 0.43284299969673157, "kl": 0.1501408815383911, "learning_rate": 4.009215600133012e-06, "loss": 0.006, "reward": 1.9406561851501465, "reward_std": 0.47611498832702637, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47190624475479126, "step": 2043 }, { "completion_length": 150.28125, "epoch": 1.0936329588014981, "grad_norm": 774.3026733398438, "kl": 2.425849199295044, "learning_rate": 4.0079744502063566e-06, "loss": 0.097, "reward": 1.36328125, "reward_std": 0.5862951874732971, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44140625, "step": 2044 }, { "completion_length": 129.59375, "epoch": 1.0941680042803639, "grad_norm": 1.4334486722946167, "kl": 0.18668121099472046, "learning_rate": 4.0067327157599074e-06, "loss": 0.0075, "reward": 1.9140625, "reward_std": 0.7618712782859802, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 2045 }, { "completion_length": 109.96875, "epoch": 1.0947030497592296, "grad_norm": 2.0747530460357666, "kl": 0.17062978446483612, "learning_rate": 4.005490397274986e-06, "loss": 0.0068, "reward": 2.7007501125335693, "reward_std": 0.5665051341056824, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4976249933242798, "step": 2046 }, { "completion_length": 136.90625, "epoch": 1.0952380952380953, "grad_norm": 1.029995322227478, "kl": 0.1442720741033554, "learning_rate": 4.004247495233137e-06, "loss": 0.0058, "reward": 2.1224687099456787, "reward_std": 0.835096001625061, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4505937397480011, "step": 2047 }, { "completion_length": 126.4375, "epoch": 1.0957731407169609, "grad_norm": 0.9868436455726624, "kl": 0.13396736979484558, "learning_rate": 4.003004010116134e-06, "loss": 0.0054, "reward": 1.8149062395095825, "reward_std": 0.8513200283050537, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4867812395095825, "step": 2048 }, { "completion_length": 145.3125, "epoch": 1.0963081861958266, "grad_norm": 1.8750213384628296, "kl": 0.2621957063674927, "learning_rate": 4.001759942405974e-06, "loss": 0.0105, "reward": 1.091406226158142, "reward_std": 0.7232438921928406, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3726562559604645, "step": 2049 }, { "completion_length": 143.34375, "epoch": 1.0968432316746923, "grad_norm": 0.6752844452857971, "kl": 0.16040769219398499, "learning_rate": 4.000515292584883e-06, "loss": 0.0064, "reward": 1.83984375, "reward_std": 1.0199174880981445, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46484375, "step": 2050 }, { "completion_length": 124.4375, "epoch": 1.097378277153558, "grad_norm": 43151.58984375, "kl": 145.22979736328125, "learning_rate": 3.9992700611353115e-06, "loss": 5.8092, "reward": 1.402437448501587, "reward_std": 0.3372722268104553, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4805625081062317, "step": 2051 }, { "completion_length": 147.34375, "epoch": 1.0979133226324238, "grad_norm": 0.7310811877250671, "kl": 0.1875457763671875, "learning_rate": 3.998024248539934e-06, "loss": 0.0075, "reward": 1.3327500820159912, "reward_std": 0.7416847944259644, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.37962502241134644, "step": 2052 }, { "completion_length": 145.15625, "epoch": 1.0984483681112895, "grad_norm": 1.0163710117340088, "kl": 0.15123185515403748, "learning_rate": 3.996777855281651e-06, "loss": 0.006, "reward": 1.788156270980835, "reward_std": 0.6113109588623047, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3975312411785126, "step": 2053 }, { "completion_length": 152.53125, "epoch": 1.098983413590155, "grad_norm": 2.4226009845733643, "kl": 0.16226285696029663, "learning_rate": 3.99553088184359e-06, "loss": 0.0065, "reward": 1.6635313034057617, "reward_std": 1.0910428762435913, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41353124380111694, "step": 2054 }, { "completion_length": 153.75, "epoch": 1.0995184590690208, "grad_norm": 0.6399405002593994, "kl": 0.13334129750728607, "learning_rate": 3.994283328709101e-06, "loss": 0.0053, "reward": 1.7421875, "reward_std": 0.7916345000267029, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.4296875, "step": 2055 }, { "completion_length": 133.46875, "epoch": 1.1000535045478865, "grad_norm": 1.255475401878357, "kl": 0.20930661261081696, "learning_rate": 3.993035196361758e-06, "loss": 0.0084, "reward": 2.2086875438690186, "reward_std": 0.7761090993881226, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4899374842643738, "step": 2056 }, { "completion_length": 118.53125, "epoch": 1.1005885500267523, "grad_norm": 1.1806719303131104, "kl": 0.2055903822183609, "learning_rate": 3.991786485285365e-06, "loss": 0.0082, "reward": 1.9765625, "reward_std": 0.4785710573196411, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 2057 }, { "completion_length": 126.75, "epoch": 1.101123595505618, "grad_norm": 1.2507028579711914, "kl": 0.20228470861911774, "learning_rate": 3.990537195963942e-06, "loss": 0.0081, "reward": 2.6738123893737793, "reward_std": 0.6078881025314331, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45506250858306885, "step": 2058 }, { "completion_length": 135.1875, "epoch": 1.1016586409844837, "grad_norm": 0.9571189284324646, "kl": 0.2197115421295166, "learning_rate": 3.98928732888174e-06, "loss": 0.0088, "reward": 1.295968770980835, "reward_std": 0.5618640780448914, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4365937113761902, "step": 2059 }, { "completion_length": 161.34375, "epoch": 1.1021936864633495, "grad_norm": 1.975777268409729, "kl": 0.16307617723941803, "learning_rate": 3.98803688452323e-06, "loss": 0.0065, "reward": 1.0614687204360962, "reward_std": 0.6291637420654297, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3739687204360962, "step": 2060 }, { "completion_length": 146.59375, "epoch": 1.102728731942215, "grad_norm": 1.3855788707733154, "kl": 0.23516514897346497, "learning_rate": 3.9867858633731084e-06, "loss": 0.0094, "reward": 1.5159687995910645, "reward_std": 0.7515142560005188, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4534687399864197, "step": 2061 }, { "completion_length": 144.21875, "epoch": 1.1032637774210807, "grad_norm": 1.7737778425216675, "kl": 0.2165098786354065, "learning_rate": 3.985534265916297e-06, "loss": 0.0087, "reward": 1.8095312118530273, "reward_std": 0.5815024375915527, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46578124165534973, "step": 2062 }, { "completion_length": 157.46875, "epoch": 1.1037988228999465, "grad_norm": 0.745543897151947, "kl": 0.11534002423286438, "learning_rate": 3.984282092637937e-06, "loss": 0.0046, "reward": 1.4279375076293945, "reward_std": 0.9616162776947021, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41231250762939453, "step": 2063 }, { "completion_length": 109.875, "epoch": 1.1043338683788122, "grad_norm": 0.8510019183158875, "kl": 0.26068347692489624, "learning_rate": 3.983029344023394e-06, "loss": 0.0104, "reward": 2.1480937004089355, "reward_std": 0.8170900344848633, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4918437600135803, "step": 2064 }, { "completion_length": 108.28125, "epoch": 1.104868913857678, "grad_norm": 0.7783153653144836, "kl": 0.16627389192581177, "learning_rate": 3.981776020558259e-06, "loss": 0.0067, "reward": 3.15625, "reward_std": 0.5078685283660889, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2065 }, { "completion_length": 152.03125, "epoch": 1.1054039593365437, "grad_norm": 0.9926010966300964, "kl": 0.15692013502120972, "learning_rate": 3.980522122728344e-06, "loss": 0.0063, "reward": 1.625, "reward_std": 0.787862241268158, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46875, "step": 2066 }, { "completion_length": 131.34375, "epoch": 1.1059390048154094, "grad_norm": 0.5327199697494507, "kl": 0.16761472821235657, "learning_rate": 3.979267651019684e-06, "loss": 0.0067, "reward": 2.1455001831054688, "reward_std": 0.8014929294586182, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4267500042915344, "step": 2067 }, { "completion_length": 122.125, "epoch": 1.106474050294275, "grad_norm": 0.8977723121643066, "kl": 0.1987336277961731, "learning_rate": 3.978012605918535e-06, "loss": 0.0079, "reward": 2.2533750534057617, "reward_std": 0.9924519658088684, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47212499380111694, "step": 2068 }, { "completion_length": 124.96875, "epoch": 1.1070090957731407, "grad_norm": 0.9951732754707336, "kl": 0.1600019335746765, "learning_rate": 3.9767569879113775e-06, "loss": 0.0064, "reward": 2.680500030517578, "reward_std": 0.7292166948318481, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47737500071525574, "step": 2069 }, { "completion_length": 134.21875, "epoch": 1.1075441412520064, "grad_norm": 0.7333360314369202, "kl": 0.1940610259771347, "learning_rate": 3.975500797484913e-06, "loss": 0.0078, "reward": 2.78125, "reward_std": 0.5737333297729492, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2070 }, { "completion_length": 140.1875, "epoch": 1.1080791867308721, "grad_norm": 2.3458478450775146, "kl": 0.22602902352809906, "learning_rate": 3.974244035126066e-06, "loss": 0.009, "reward": 1.4682812690734863, "reward_std": 0.8476097583770752, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43703123927116394, "step": 2071 }, { "completion_length": 122.96875, "epoch": 1.1086142322097379, "grad_norm": 1.5156461000442505, "kl": 0.23555979132652283, "learning_rate": 3.97298670132198e-06, "loss": 0.0094, "reward": 2.1875, "reward_std": 0.36222195625305176, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 2072 }, { "completion_length": 118.375, "epoch": 1.1091492776886036, "grad_norm": 0.9978967308998108, "kl": 0.19411805272102356, "learning_rate": 3.971728796560023e-06, "loss": 0.0078, "reward": 1.909749984741211, "reward_std": 0.7281877994537354, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47224998474121094, "step": 2073 }, { "completion_length": 157.84375, "epoch": 1.1096843231674693, "grad_norm": 0.3862416446208954, "kl": 0.15205542743206024, "learning_rate": 3.97047032132778e-06, "loss": 0.0061, "reward": 1.955468773841858, "reward_std": 0.6531025171279907, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4398437440395355, "step": 2074 }, { "completion_length": 125.09375, "epoch": 1.1102193686463349, "grad_norm": 1.1329008340835571, "kl": 0.18722400069236755, "learning_rate": 3.969211276113064e-06, "loss": 0.0075, "reward": 2.5904064178466797, "reward_std": 0.912235677242279, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48103123903274536, "step": 2075 }, { "completion_length": 129.71875, "epoch": 1.1107544141252006, "grad_norm": 2.6196064949035645, "kl": 0.17949000000953674, "learning_rate": 3.967951661403904e-06, "loss": 0.0072, "reward": 1.8203125, "reward_std": 0.7188053131103516, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 2076 }, { "completion_length": 142.5625, "epoch": 1.1112894596040663, "grad_norm": 2.517756223678589, "kl": 0.16671603918075562, "learning_rate": 3.966691477688549e-06, "loss": 0.0067, "reward": 1.756812572479248, "reward_std": 0.7100247144699097, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47556251287460327, "step": 2077 }, { "completion_length": 115.6875, "epoch": 1.111824505082932, "grad_norm": 2.8890652656555176, "kl": 0.18400105834007263, "learning_rate": 3.965430725455472e-06, "loss": 0.0074, "reward": 2.319187641143799, "reward_std": 0.6081588268280029, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4910624921321869, "step": 2078 }, { "completion_length": 147.3125, "epoch": 1.1123595505617978, "grad_norm": 3.141923427581787, "kl": 0.20786696672439575, "learning_rate": 3.964169405193364e-06, "loss": 0.0083, "reward": 2.05859375, "reward_std": 1.022810697555542, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43359375, "step": 2079 }, { "completion_length": 157.09375, "epoch": 1.1128945960406635, "grad_norm": 0.5584728121757507, "kl": 0.11088134348392487, "learning_rate": 3.962907517391138e-06, "loss": 0.0044, "reward": 1.49609375, "reward_std": 0.47998547554016113, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48046875, "step": 2080 }, { "completion_length": 141.03125, "epoch": 1.113429641519529, "grad_norm": 0.7761791348457336, "kl": 0.1408567577600479, "learning_rate": 3.961645062537926e-06, "loss": 0.0056, "reward": 1.976906180381775, "reward_std": 0.7910687327384949, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4456562399864197, "step": 2081 }, { "completion_length": 151.625, "epoch": 1.1139646869983948, "grad_norm": 0.9938198328018188, "kl": 0.18569499254226685, "learning_rate": 3.960382041123076e-06, "loss": 0.0074, "reward": 1.78125, "reward_std": 0.8927829265594482, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46875, "step": 2082 }, { "completion_length": 159.875, "epoch": 1.1144997324772605, "grad_norm": 1.1130883693695068, "kl": 0.20375093817710876, "learning_rate": 3.959118453636164e-06, "loss": 0.0082, "reward": 1.5970938205718994, "reward_std": 1.0350563526153564, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.40959376096725464, "step": 2083 }, { "completion_length": 121.96875, "epoch": 1.1150347779561263, "grad_norm": 1.167022466659546, "kl": 0.23945896327495575, "learning_rate": 3.957854300566979e-06, "loss": 0.0096, "reward": 1.5848125219345093, "reward_std": 0.7438939809799194, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4754374921321869, "step": 2084 }, { "completion_length": 119.6875, "epoch": 1.115569823434992, "grad_norm": 0.7273324728012085, "kl": 0.1576654314994812, "learning_rate": 3.95658958240553e-06, "loss": 0.0063, "reward": 2.3519375324249268, "reward_std": 0.8579714298248291, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49256250262260437, "step": 2085 }, { "completion_length": 101.625, "epoch": 1.1161048689138577, "grad_norm": 0.9775776863098145, "kl": 0.2299412488937378, "learning_rate": 3.955324299642047e-06, "loss": 0.0092, "reward": 1.9029061794281006, "reward_std": 0.5904887318611145, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49665623903274536, "step": 2086 }, { "completion_length": 134.8125, "epoch": 1.1166399143927235, "grad_norm": 1.194732666015625, "kl": 0.19529138505458832, "learning_rate": 3.954058452766979e-06, "loss": 0.0078, "reward": 2.757093906402588, "reward_std": 0.7010200023651123, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46021875739097595, "step": 2087 }, { "completion_length": 139.78125, "epoch": 1.117174959871589, "grad_norm": 2.477210521697998, "kl": 0.36918237805366516, "learning_rate": 3.952792042270989e-06, "loss": 0.0148, "reward": 1.4619687795639038, "reward_std": 0.8420151472091675, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4463437497615814, "step": 2088 }, { "completion_length": 104.21875, "epoch": 1.1177100053504547, "grad_norm": 0.9086970090866089, "kl": 0.18199793994426727, "learning_rate": 3.951525068644965e-06, "loss": 0.0073, "reward": 3.0859999656677246, "reward_std": 0.5008748173713684, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.492249995470047, "step": 2089 }, { "completion_length": 131.875, "epoch": 1.1182450508293205, "grad_norm": 1.077973484992981, "kl": 0.16243410110473633, "learning_rate": 3.950257532380009e-06, "loss": 0.0065, "reward": 1.565906286239624, "reward_std": 0.6280826330184937, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44090625643730164, "step": 2090 }, { "completion_length": 110.125, "epoch": 1.1187800963081862, "grad_norm": 1029.739501953125, "kl": 2.4378976821899414, "learning_rate": 3.948989433967444e-06, "loss": 0.0975, "reward": 1.8984687328338623, "reward_std": 0.664604663848877, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4922187626361847, "step": 2091 }, { "completion_length": 152.1875, "epoch": 1.119315141787052, "grad_norm": 0.8092218637466431, "kl": 0.11917205154895782, "learning_rate": 3.947720773898808e-06, "loss": 0.0048, "reward": 1.4837499856948853, "reward_std": 0.9502888321876526, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.37437501549720764, "step": 2092 }, { "completion_length": 126.46875, "epoch": 1.1198501872659177, "grad_norm": 0.4853752553462982, "kl": 0.1635512411594391, "learning_rate": 3.946451552665857e-06, "loss": 0.0065, "reward": 2.2333436012268066, "reward_std": 0.8545035719871521, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4677187502384186, "step": 2093 }, { "completion_length": 125.3125, "epoch": 1.1203852327447834, "grad_norm": 4.508914470672607, "kl": 0.31211423873901367, "learning_rate": 3.945181770760566e-06, "loss": 0.0125, "reward": 1.938249945640564, "reward_std": 0.9033689498901367, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46950000524520874, "step": 2094 }, { "completion_length": 148.78125, "epoch": 1.120920278223649, "grad_norm": 0.6146669983863831, "kl": 0.13997644186019897, "learning_rate": 3.943911428675128e-06, "loss": 0.0056, "reward": 2.047874927520752, "reward_std": 0.6068066358566284, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4541250169277191, "step": 2095 }, { "completion_length": 115.78125, "epoch": 1.1214553237025147, "grad_norm": 1.2571589946746826, "kl": 0.2342359572649002, "learning_rate": 3.942640526901951e-06, "loss": 0.0094, "reward": 1.7832187414169312, "reward_std": 0.6931269764900208, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43946874141693115, "step": 2096 }, { "completion_length": 138.5, "epoch": 1.1219903691813804, "grad_norm": 0.8460745811462402, "kl": 0.2106955349445343, "learning_rate": 3.94136906593366e-06, "loss": 0.0084, "reward": 1.630312442779541, "reward_std": 0.8426315784454346, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4584375023841858, "step": 2097 }, { "completion_length": 145.5625, "epoch": 1.1225254146602461, "grad_norm": 0.9237003922462463, "kl": 0.21086755394935608, "learning_rate": 3.9400970462631e-06, "loss": 0.0084, "reward": 1.721093773841858, "reward_std": 0.6512446403503418, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.3929687440395355, "step": 2098 }, { "completion_length": 137.8125, "epoch": 1.1230604601391119, "grad_norm": 6.858280658721924, "kl": 0.41672325134277344, "learning_rate": 3.938824468383328e-06, "loss": 0.0167, "reward": 1.911656141281128, "reward_std": 0.887399435043335, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4116562604904175, "step": 2099 }, { "completion_length": 134.9375, "epoch": 1.1235955056179776, "grad_norm": 1.1256937980651855, "kl": 0.1607152223587036, "learning_rate": 3.937551332787621e-06, "loss": 0.0064, "reward": 1.812343716621399, "reward_std": 0.7139804363250732, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4373437464237213, "step": 2100 }, { "completion_length": 133.25, "epoch": 1.1241305510968433, "grad_norm": 1.0715662240982056, "kl": 0.1797800064086914, "learning_rate": 3.936277639969469e-06, "loss": 0.0072, "reward": 1.7979375123977661, "reward_std": 0.43169718980789185, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4385625123977661, "step": 2101 }, { "completion_length": 130.78125, "epoch": 1.1246655965757089, "grad_norm": 47272.95703125, "kl": 235.75425720214844, "learning_rate": 3.9350033904225824e-06, "loss": 9.4302, "reward": 1.796875, "reward_std": 0.4882320761680603, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2102 }, { "completion_length": 119.1875, "epoch": 1.1252006420545746, "grad_norm": 2.2763426303863525, "kl": 0.24061930179595947, "learning_rate": 3.9337285846408826e-06, "loss": 0.0096, "reward": 2.2965312004089355, "reward_std": 0.6508442163467407, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4840312600135803, "step": 2103 }, { "completion_length": 141.71875, "epoch": 1.1257356875334403, "grad_norm": 1.2465200424194336, "kl": 0.1794959157705307, "learning_rate": 3.9324532231185095e-06, "loss": 0.0072, "reward": 1.7653436660766602, "reward_std": 0.6131960153579712, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4528437554836273, "step": 2104 }, { "completion_length": 144.90625, "epoch": 1.126270733012306, "grad_norm": 1.0063713788986206, "kl": 0.13590815663337708, "learning_rate": 3.931177306349818e-06, "loss": 0.0054, "reward": 1.009218692779541, "reward_std": 0.4548106789588928, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4154687523841858, "step": 2105 }, { "completion_length": 123.1875, "epoch": 1.1268057784911718, "grad_norm": 1.669037938117981, "kl": 0.25763580203056335, "learning_rate": 3.929900834829376e-06, "loss": 0.0103, "reward": 2.216156482696533, "reward_std": 1.1215596199035645, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4505312442779541, "step": 2106 }, { "completion_length": 120.84375, "epoch": 1.1273408239700375, "grad_norm": 1.1103752851486206, "kl": 0.22591502964496613, "learning_rate": 3.928623809051972e-06, "loss": 0.009, "reward": 2.6579062938690186, "reward_std": 0.8368791937828064, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4860312342643738, "step": 2107 }, { "completion_length": 124.125, "epoch": 1.127875869448903, "grad_norm": 6.254481315612793, "kl": 0.18841516971588135, "learning_rate": 3.927346229512602e-06, "loss": 0.0075, "reward": 2.5727500915527344, "reward_std": 1.0328288078308105, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4790000021457672, "step": 2108 }, { "completion_length": 117.71875, "epoch": 1.1284109149277688, "grad_norm": 1.0002511739730835, "kl": 0.19626522064208984, "learning_rate": 3.926068096706481e-06, "loss": 0.0079, "reward": 1.984375, "reward_std": 0.6514768600463867, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 2109 }, { "completion_length": 155.4375, "epoch": 1.1289459604066345, "grad_norm": 0.8091625571250916, "kl": 0.13247671723365784, "learning_rate": 3.92478941112904e-06, "loss": 0.0053, "reward": 2.003781318664551, "reward_std": 1.1145238876342773, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.394406259059906, "step": 2110 }, { "completion_length": 151.6875, "epoch": 1.1294810058855003, "grad_norm": 0.7549888491630554, "kl": 0.15029335021972656, "learning_rate": 3.92351017327592e-06, "loss": 0.006, "reward": 1.441999912261963, "reward_std": 1.0365068912506104, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.37950000166893005, "step": 2111 }, { "completion_length": 132.1875, "epoch": 1.130016051364366, "grad_norm": 0.9106602668762207, "kl": 0.1411239057779312, "learning_rate": 3.922230383642979e-06, "loss": 0.0056, "reward": 1.6139062643051147, "reward_std": 0.6756649613380432, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47328126430511475, "step": 2112 }, { "completion_length": 152.96875, "epoch": 1.1305510968432317, "grad_norm": 0.6411001682281494, "kl": 0.13515198230743408, "learning_rate": 3.920950042726287e-06, "loss": 0.0054, "reward": 1.2370312213897705, "reward_std": 0.5619482398033142, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4245312511920929, "step": 2113 }, { "completion_length": 128.71875, "epoch": 1.1310861423220975, "grad_norm": 6.502212047576904, "kl": 0.1333048939704895, "learning_rate": 3.91966915102213e-06, "loss": 0.0053, "reward": 2.231156349182129, "reward_std": 0.7757799625396729, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.43428125977516174, "step": 2114 }, { "completion_length": 126.0625, "epoch": 1.131621187800963, "grad_norm": 1.4366326332092285, "kl": 0.24858340620994568, "learning_rate": 3.918387709027005e-06, "loss": 0.0099, "reward": 1.6705623865127563, "reward_std": 0.7000455856323242, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4518125057220459, "step": 2115 }, { "completion_length": 145.0625, "epoch": 1.1321562332798287, "grad_norm": 1.2684447765350342, "kl": 0.1489335298538208, "learning_rate": 3.9171057172376245e-06, "loss": 0.006, "reward": 1.878749966621399, "reward_std": 0.7796025276184082, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4099999964237213, "step": 2116 }, { "completion_length": 124.40625, "epoch": 1.1326912787586945, "grad_norm": 1680.839111328125, "kl": 381.9397888183594, "learning_rate": 3.9158231761509125e-06, "loss": 15.2776, "reward": 2.426281213760376, "reward_std": 0.9475796222686768, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47315624356269836, "step": 2117 }, { "completion_length": 136.3125, "epoch": 1.1332263242375602, "grad_norm": 1.9930446147918701, "kl": 0.2580249011516571, "learning_rate": 3.914540086264006e-06, "loss": 0.0103, "reward": 2.130593776702881, "reward_std": 1.1189618110656738, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45871874690055847, "step": 2118 }, { "completion_length": 129.25, "epoch": 1.133761369716426, "grad_norm": 2.5244786739349365, "kl": 0.15202349424362183, "learning_rate": 3.913256448074257e-06, "loss": 0.0061, "reward": 2.5645313262939453, "reward_std": 0.9800781011581421, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45515626668930054, "step": 2119 }, { "completion_length": 152.25, "epoch": 1.1342964151952917, "grad_norm": 1.0197163820266724, "kl": 0.1519971489906311, "learning_rate": 3.911972262079225e-06, "loss": 0.0061, "reward": 1.6439062356948853, "reward_std": 0.9391777515411377, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44078126549720764, "step": 2120 }, { "completion_length": 115.15625, "epoch": 1.1348314606741572, "grad_norm": 0.7008668184280396, "kl": 0.1872245967388153, "learning_rate": 3.910687528776688e-06, "loss": 0.0075, "reward": 2.63671875, "reward_std": 0.6180354356765747, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49609375, "step": 2121 }, { "completion_length": 143.71875, "epoch": 1.135366506153023, "grad_norm": 40998.890625, "kl": 179.97329711914062, "learning_rate": 3.909402248664633e-06, "loss": 7.1989, "reward": 1.676281213760376, "reward_std": 0.8657727241516113, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45753124356269836, "step": 2122 }, { "completion_length": 122.28125, "epoch": 1.1359015516318887, "grad_norm": 4.600080966949463, "kl": 0.2490220069885254, "learning_rate": 3.908116422241258e-06, "loss": 0.01, "reward": 2.5133438110351562, "reward_std": 0.8862427473068237, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4820937514305115, "step": 2123 }, { "completion_length": 119.6875, "epoch": 1.1364365971107544, "grad_norm": 1.093560814857483, "kl": 0.29244351387023926, "learning_rate": 3.906830050004975e-06, "loss": 0.0117, "reward": 2.5687813758850098, "reward_std": 0.69917893409729, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4750312566757202, "step": 2124 }, { "completion_length": 153.40625, "epoch": 1.1369716425896201, "grad_norm": 1.6201239824295044, "kl": 0.1890673190355301, "learning_rate": 3.9055431324544055e-06, "loss": 0.0076, "reward": 1.8820624351501465, "reward_std": 0.9219846129417419, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42893749475479126, "step": 2125 }, { "completion_length": 142.15625, "epoch": 1.1375066880684859, "grad_norm": 67895.8671875, "kl": 182.0020294189453, "learning_rate": 3.904255670088384e-06, "loss": 7.2801, "reward": 2.2923126220703125, "reward_std": 0.9570159912109375, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.38606250286102295, "step": 2126 }, { "completion_length": 160.78125, "epoch": 1.1380417335473516, "grad_norm": 1.6693451404571533, "kl": 0.27983081340789795, "learning_rate": 3.9029676634059565e-06, "loss": 0.0112, "reward": 1.6652499437332153, "reward_std": 0.7126107215881348, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4308750033378601, "step": 2127 }, { "completion_length": 166.3125, "epoch": 1.1385767790262173, "grad_norm": 1.5335601568222046, "kl": 0.17342443764209747, "learning_rate": 3.901679112906378e-06, "loss": 0.0069, "reward": 1.4163436889648438, "reward_std": 0.9444534182548523, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4007187485694885, "step": 2128 }, { "completion_length": 112.25, "epoch": 1.1391118245050829, "grad_norm": 3.5749733448028564, "kl": 0.22459420561790466, "learning_rate": 3.900390019089114e-06, "loss": 0.009, "reward": 2.4352500438690186, "reward_std": 0.905683159828186, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4821249842643738, "step": 2129 }, { "completion_length": 166.875, "epoch": 1.1396468699839486, "grad_norm": 1.000090479850769, "kl": 0.11059807240962982, "learning_rate": 3.899100382453846e-06, "loss": 0.0044, "reward": 1.323625087738037, "reward_std": 1.0493611097335815, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.37050002813339233, "step": 2130 }, { "completion_length": 142.65625, "epoch": 1.1401819154628143, "grad_norm": 1.4621163606643677, "kl": 0.16791090369224548, "learning_rate": 3.897810203500457e-06, "loss": 0.0067, "reward": 1.607968807220459, "reward_std": 0.4520484209060669, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4673437476158142, "step": 2131 }, { "completion_length": 125.5, "epoch": 1.14071696094168, "grad_norm": 1079210.125, "kl": 2635.305419921875, "learning_rate": 3.896519482729047e-06, "loss": 105.4122, "reward": 1.758062481880188, "reward_std": 0.6842887997627258, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.476812481880188, "step": 2132 }, { "completion_length": 139.4375, "epoch": 1.1412520064205458, "grad_norm": 0.8262710571289062, "kl": 0.16034674644470215, "learning_rate": 3.8952282206399265e-06, "loss": 0.0064, "reward": 2.0556564331054688, "reward_std": 1.089639663696289, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4462812542915344, "step": 2133 }, { "completion_length": 107.5, "epoch": 1.1417870518994115, "grad_norm": 0.8948279023170471, "kl": 0.21268512308597565, "learning_rate": 3.8939364177336105e-06, "loss": 0.0085, "reward": 3.151656150817871, "reward_std": 0.5751640200614929, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47978124022483826, "step": 2134 }, { "completion_length": 144.09375, "epoch": 1.142322097378277, "grad_norm": 0.47641414403915405, "kl": 0.13456997275352478, "learning_rate": 3.892644074510828e-06, "loss": 0.0054, "reward": 1.1921563148498535, "reward_std": 0.6484130620956421, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.37965625524520874, "step": 2135 }, { "completion_length": 89.46875, "epoch": 1.1428571428571428, "grad_norm": 1.0658328533172607, "kl": 0.20146644115447998, "learning_rate": 3.891351191472516e-06, "loss": 0.0081, "reward": 2.6058125495910645, "reward_std": 0.425923228263855, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4808124899864197, "step": 2136 }, { "completion_length": 130.4375, "epoch": 1.1433921883360085, "grad_norm": 1.4536346197128296, "kl": 0.21407824754714966, "learning_rate": 3.8900577691198186e-06, "loss": 0.0086, "reward": 1.9303125143051147, "reward_std": 0.7485568523406982, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49281251430511475, "step": 2137 }, { "completion_length": 129.3125, "epoch": 1.1439272338148743, "grad_norm": 0.5422364473342896, "kl": 0.16118589043617249, "learning_rate": 3.888763807954095e-06, "loss": 0.0064, "reward": 2.0960311889648438, "reward_std": 0.6837825775146484, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4397812485694885, "step": 2138 }, { "completion_length": 155.375, "epoch": 1.14446227929374, "grad_norm": 0.7160618901252747, "kl": 0.14360979199409485, "learning_rate": 3.8874693084769055e-06, "loss": 0.0057, "reward": 1.4064061641693115, "reward_std": 0.7464972734451294, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4064062237739563, "step": 2139 }, { "completion_length": 139.25, "epoch": 1.1449973247726057, "grad_norm": 1.305967092514038, "kl": 0.14457058906555176, "learning_rate": 3.886174271190024e-06, "loss": 0.0058, "reward": 2.0815000534057617, "reward_std": 0.8923760652542114, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45649999380111694, "step": 2140 }, { "completion_length": 141.25, "epoch": 1.1455323702514715, "grad_norm": 1.0435572862625122, "kl": 0.16514715552330017, "learning_rate": 3.884878696595433e-06, "loss": 0.0066, "reward": 2.074031352996826, "reward_std": 0.7736772894859314, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4646562337875366, "step": 2141 }, { "completion_length": 130.625, "epoch": 1.146067415730337, "grad_norm": 1.48379647731781, "kl": 0.18444925546646118, "learning_rate": 3.883582585195321e-06, "loss": 0.0074, "reward": 3.1288437843322754, "reward_std": 0.705875039100647, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.488218754529953, "step": 2142 }, { "completion_length": 103.90625, "epoch": 1.1466024612092027, "grad_norm": 1.9890574216842651, "kl": 0.24461208283901215, "learning_rate": 3.882285937492087e-06, "loss": 0.0098, "reward": 2.032249927520752, "reward_std": 0.519929051399231, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46974998712539673, "step": 2143 }, { "completion_length": 140.1875, "epoch": 1.1471375066880685, "grad_norm": 2.4027092456817627, "kl": 0.2940150499343872, "learning_rate": 3.880988753988335e-06, "loss": 0.0118, "reward": 2.227250099182129, "reward_std": 1.1805979013442993, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43037500977516174, "step": 2144 }, { "completion_length": 139.1875, "epoch": 1.1476725521669342, "grad_norm": 1.4925590753555298, "kl": 0.24481844902038574, "learning_rate": 3.8796910351868776e-06, "loss": 0.0098, "reward": 1.2330937385559082, "reward_std": 0.6302504539489746, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4518437385559082, "step": 2145 }, { "completion_length": 122.09375, "epoch": 1.1482075976458, "grad_norm": 2.113617181777954, "kl": 0.15253612399101257, "learning_rate": 3.878392781590738e-06, "loss": 0.0061, "reward": 1.2410937547683716, "reward_std": 0.48644599318504333, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4442187547683716, "step": 2146 }, { "completion_length": 105.1875, "epoch": 1.1487426431246657, "grad_norm": 0.8568593859672546, "kl": 0.1659141331911087, "learning_rate": 3.877093993703142e-06, "loss": 0.0066, "reward": 2.5962812900543213, "reward_std": 1.0323905944824219, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4869062304496765, "step": 2147 }, { "completion_length": 111.3125, "epoch": 1.1492776886035312, "grad_norm": 1.0260069370269775, "kl": 0.18912240862846375, "learning_rate": 3.875794672027525e-06, "loss": 0.0076, "reward": 2.235968828201294, "reward_std": 0.5513600707054138, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47034376859664917, "step": 2148 }, { "completion_length": 147.0625, "epoch": 1.149812734082397, "grad_norm": 1.0491175651550293, "kl": 0.20394840836524963, "learning_rate": 3.874494817067529e-06, "loss": 0.0082, "reward": 1.8937500715255737, "reward_std": 1.1600356101989746, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44062498211860657, "step": 2149 }, { "completion_length": 131.71875, "epoch": 1.1503477795612627, "grad_norm": 2.904141426086426, "kl": 0.3203774690628052, "learning_rate": 3.873194429327003e-06, "loss": 0.0128, "reward": 2.203312397003174, "reward_std": 0.7393868565559387, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4689375162124634, "step": 2150 }, { "completion_length": 103.8125, "epoch": 1.1508828250401284, "grad_norm": 1.4521093368530273, "kl": 0.18769721686840057, "learning_rate": 3.871893509310003e-06, "loss": 0.0075, "reward": 2.886812686920166, "reward_std": 0.7831661701202393, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4805625081062317, "step": 2151 }, { "completion_length": 127.21875, "epoch": 1.1514178705189941, "grad_norm": 0.8303498029708862, "kl": 0.14267443120479584, "learning_rate": 3.87059205752079e-06, "loss": 0.0057, "reward": 2.0390625, "reward_std": 0.6890943050384521, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4765625, "step": 2152 }, { "completion_length": 149.5, "epoch": 1.1519529159978599, "grad_norm": 7.09489107131958, "kl": 0.17147402465343475, "learning_rate": 3.86929007446383e-06, "loss": 0.0069, "reward": 1.4172500371932983, "reward_std": 0.6076471209526062, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.38599997758865356, "step": 2153 }, { "completion_length": 129.625, "epoch": 1.1524879614767256, "grad_norm": 14.577017784118652, "kl": 0.34663715958595276, "learning_rate": 3.867987560643799e-06, "loss": 0.0139, "reward": 2.113187551498413, "reward_std": 0.9045143723487854, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4569374918937683, "step": 2154 }, { "completion_length": 120.375, "epoch": 1.1530230069555913, "grad_norm": 0.9597927331924438, "kl": 0.1626420021057129, "learning_rate": 3.866684516565575e-06, "loss": 0.0065, "reward": 2.359375, "reward_std": 0.5481458306312561, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 2155 }, { "completion_length": 130.09375, "epoch": 1.1535580524344569, "grad_norm": 1.4950037002563477, "kl": 0.32976311445236206, "learning_rate": 3.865380942734243e-06, "loss": 0.0132, "reward": 2.2934999465942383, "reward_std": 0.6260391473770142, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46537500619888306, "step": 2156 }, { "completion_length": 140.28125, "epoch": 1.1540930979133226, "grad_norm": 2.122612237930298, "kl": 0.18973849713802338, "learning_rate": 3.864076839655093e-06, "loss": 0.0076, "reward": 1.3165936470031738, "reward_std": 0.7128181457519531, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4259687662124634, "step": 2157 }, { "completion_length": 137.65625, "epoch": 1.1546281433921883, "grad_norm": 1.3894745111465454, "kl": 0.1920897364616394, "learning_rate": 3.862772207833621e-06, "loss": 0.0077, "reward": 1.4141874313354492, "reward_std": 0.5915348529815674, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.414187490940094, "step": 2158 }, { "completion_length": 107.125, "epoch": 1.155163188871054, "grad_norm": 1.6135499477386475, "kl": 0.2598809003829956, "learning_rate": 3.861467047775527e-06, "loss": 0.0104, "reward": 2.067312479019165, "reward_std": 0.8525208830833435, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4891875088214874, "step": 2159 }, { "completion_length": 125.875, "epoch": 1.1556982343499198, "grad_norm": 1.0602669715881348, "kl": 0.17100846767425537, "learning_rate": 3.860161359986716e-06, "loss": 0.0068, "reward": 2.4510624408721924, "reward_std": 0.7642943263053894, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46668750047683716, "step": 2160 }, { "completion_length": 153.40625, "epoch": 1.1562332798287855, "grad_norm": 1.0456942319869995, "kl": 0.17430584132671356, "learning_rate": 3.858855144973298e-06, "loss": 0.007, "reward": 1.097749948501587, "reward_std": 0.794395923614502, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.3321250081062317, "step": 2161 }, { "completion_length": 146.5, "epoch": 1.156768325307651, "grad_norm": 2.6947805881500244, "kl": 0.20395535230636597, "learning_rate": 3.857548403241587e-06, "loss": 0.0082, "reward": 1.5265936851501465, "reward_std": 1.0417711734771729, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.41721874475479126, "step": 2162 }, { "completion_length": 139.3125, "epoch": 1.1573033707865168, "grad_norm": 1.0221821069717407, "kl": 0.17748504877090454, "learning_rate": 3.8562411352981e-06, "loss": 0.0071, "reward": 1.7000312805175781, "reward_std": 0.5788468718528748, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48128125071525574, "step": 2163 }, { "completion_length": 120.90625, "epoch": 1.1578384162653825, "grad_norm": 0.555193305015564, "kl": 0.2227010577917099, "learning_rate": 3.854933341649559e-06, "loss": 0.0089, "reward": 2.927500009536743, "reward_std": 0.5332643985748291, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.45875000953674316, "step": 2164 }, { "completion_length": 137.1875, "epoch": 1.1583734617442483, "grad_norm": 1.7077977657318115, "kl": 0.24310508370399475, "learning_rate": 3.853625022802892e-06, "loss": 0.0097, "reward": 1.7770625352859497, "reward_std": 0.9874175190925598, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4333125054836273, "step": 2165 }, { "completion_length": 128.84375, "epoch": 1.158908507223114, "grad_norm": 1.6374872922897339, "kl": 0.3143743872642517, "learning_rate": 3.852316179265226e-06, "loss": 0.0126, "reward": 1.5525000095367432, "reward_std": 0.7520537376403809, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4274999797344208, "step": 2166 }, { "completion_length": 122.5625, "epoch": 1.1594435527019797, "grad_norm": 0.9122248291969299, "kl": 0.25497254729270935, "learning_rate": 3.851006811543896e-06, "loss": 0.0102, "reward": 1.5771875381469727, "reward_std": 0.6195245981216431, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46781250834465027, "step": 2167 }, { "completion_length": 123.59375, "epoch": 1.1599785981808455, "grad_norm": 0.6831244230270386, "kl": 0.1577405333518982, "learning_rate": 3.849696920146437e-06, "loss": 0.0063, "reward": 2.3894686698913574, "reward_std": 0.9387059211730957, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4363437592983246, "step": 2168 }, { "completion_length": 128.03125, "epoch": 1.160513643659711, "grad_norm": 3.1728386878967285, "kl": 0.20218344032764435, "learning_rate": 3.848386505580588e-06, "loss": 0.0081, "reward": 1.507406234741211, "reward_std": 0.7021282911300659, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44490623474121094, "step": 2169 }, { "completion_length": 117.0625, "epoch": 1.1610486891385767, "grad_norm": 1.4859126806259155, "kl": 0.19728511571884155, "learning_rate": 3.847075568354291e-06, "loss": 0.0079, "reward": 2.2334060668945312, "reward_std": 0.8946541547775269, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4834062457084656, "step": 2170 }, { "completion_length": 160.15625, "epoch": 1.1615837346174425, "grad_norm": 1.3913304805755615, "kl": 0.1337127536535263, "learning_rate": 3.845764108975689e-06, "loss": 0.0053, "reward": 1.602718710899353, "reward_std": 0.9258275032043457, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3839687407016754, "step": 2171 }, { "completion_length": 132.9375, "epoch": 1.1621187800963082, "grad_norm": 0.497121661901474, "kl": 0.15159469842910767, "learning_rate": 3.844452127953131e-06, "loss": 0.0061, "reward": 2.2349374294281006, "reward_std": 0.3730839490890503, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.45368748903274536, "step": 2172 }, { "completion_length": 127.375, "epoch": 1.162653825575174, "grad_norm": 5.336389064788818, "kl": 0.1715376228094101, "learning_rate": 3.8431396257951655e-06, "loss": 0.0069, "reward": 2.1973438262939453, "reward_std": 1.1423622369766235, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46296876668930054, "step": 2173 }, { "completion_length": 123.34375, "epoch": 1.1631888710540397, "grad_norm": 0.9615598320960999, "kl": 0.15178492665290833, "learning_rate": 3.8418266030105424e-06, "loss": 0.0061, "reward": 2.22265625, "reward_std": 0.8155076503753662, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48828125, "step": 2174 }, { "completion_length": 121.25, "epoch": 1.1637239165329052, "grad_norm": 4.968196868896484, "kl": 0.30560386180877686, "learning_rate": 3.840513060108215e-06, "loss": 0.0122, "reward": 1.6552499532699585, "reward_std": 0.6596767902374268, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46775001287460327, "step": 2175 }, { "completion_length": 124.53125, "epoch": 1.164258962011771, "grad_norm": 0.8751515746116638, "kl": 0.18315894901752472, "learning_rate": 3.839198997597338e-06, "loss": 0.0073, "reward": 2.4535000324249268, "reward_std": 0.7106964588165283, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45350000262260437, "step": 2176 }, { "completion_length": 158.28125, "epoch": 1.1647940074906367, "grad_norm": 0.9156041741371155, "kl": 0.16957879066467285, "learning_rate": 3.837884415987266e-06, "loss": 0.0068, "reward": 1.6713438034057617, "reward_std": 0.9452221393585205, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43696874380111694, "step": 2177 }, { "completion_length": 138.09375, "epoch": 1.1653290529695024, "grad_norm": 0.764622151851654, "kl": 0.14417022466659546, "learning_rate": 3.836569315787558e-06, "loss": 0.0058, "reward": 1.5272812843322754, "reward_std": 0.5476966500282288, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.496031254529953, "step": 2178 }, { "completion_length": 128.5625, "epoch": 1.1658640984483681, "grad_norm": 0.9400740265846252, "kl": 0.18944862484931946, "learning_rate": 3.83525369750797e-06, "loss": 0.0076, "reward": 2.33203125, "reward_std": 0.9053213596343994, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 2179 }, { "completion_length": 113.65625, "epoch": 1.1663991439272339, "grad_norm": 0.9068821668624878, "kl": 0.2563968002796173, "learning_rate": 3.833937561658463e-06, "loss": 0.0103, "reward": 2.628000020980835, "reward_std": 0.8078013062477112, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.47175002098083496, "step": 2180 }, { "completion_length": 134.9375, "epoch": 1.1669341894060996, "grad_norm": 1.7631138563156128, "kl": 0.27898186445236206, "learning_rate": 3.832620908749196e-06, "loss": 0.0112, "reward": 1.373843789100647, "reward_std": 0.7726234197616577, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.436343789100647, "step": 2181 }, { "completion_length": 149.375, "epoch": 1.1674692348849653, "grad_norm": 0.6589639782905579, "kl": 0.1285209059715271, "learning_rate": 3.8313037392905266e-06, "loss": 0.0051, "reward": 1.0905312299728394, "reward_std": 0.3301030695438385, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41865625977516174, "step": 2182 }, { "completion_length": 128.28125, "epoch": 1.1680042803638309, "grad_norm": 0.6642359495162964, "kl": 0.16424083709716797, "learning_rate": 3.8299860537930175e-06, "loss": 0.0066, "reward": 1.73618745803833, "reward_std": 0.4720972180366516, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48618751764297485, "step": 2183 }, { "completion_length": 149.40625, "epoch": 1.1685393258426966, "grad_norm": 1.120681643486023, "kl": 0.1554037183523178, "learning_rate": 3.828667852767428e-06, "loss": 0.0062, "reward": 1.5641250610351562, "reward_std": 0.9091575145721436, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4235000014305115, "step": 2184 }, { "completion_length": 112.90625, "epoch": 1.1690743713215623, "grad_norm": 1.4905933141708374, "kl": 0.20579370856285095, "learning_rate": 3.827349136724717e-06, "loss": 0.0082, "reward": 2.337718963623047, "reward_std": 0.7946338057518005, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47834375500679016, "step": 2185 }, { "completion_length": 151.46875, "epoch": 1.169609416800428, "grad_norm": 1.0442358255386353, "kl": 0.18560674786567688, "learning_rate": 3.826029906176047e-06, "loss": 0.0074, "reward": 1.4535937309265137, "reward_std": 0.5676496624946594, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46921876072883606, "step": 2186 }, { "completion_length": 110.1875, "epoch": 1.1701444622792938, "grad_norm": 1.2605600357055664, "kl": 0.226331889629364, "learning_rate": 3.824710161632773e-06, "loss": 0.0091, "reward": 2.714156150817871, "reward_std": 0.5852946043014526, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49540624022483826, "step": 2187 }, { "completion_length": 139.53125, "epoch": 1.1706795077581593, "grad_norm": 1.298699140548706, "kl": 0.17168636620044708, "learning_rate": 3.823389903606457e-06, "loss": 0.0069, "reward": 1.9975625276565552, "reward_std": 0.6543107628822327, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4350624978542328, "step": 2188 }, { "completion_length": 112.9375, "epoch": 1.171214553237025, "grad_norm": 2.544356107711792, "kl": 0.2160005420446396, "learning_rate": 3.8220691326088545e-06, "loss": 0.0086, "reward": 2.6138439178466797, "reward_std": 0.8916547298431396, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48884373903274536, "step": 2189 }, { "completion_length": 127.46875, "epoch": 1.1717495987158908, "grad_norm": 0.9130262136459351, "kl": 0.26214170455932617, "learning_rate": 3.820747849151922e-06, "loss": 0.0105, "reward": 1.875, "reward_std": 0.5043455362319946, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46875, "step": 2190 }, { "completion_length": 114.125, "epoch": 1.1722846441947565, "grad_norm": 0.9813275337219238, "kl": 0.17820321023464203, "learning_rate": 3.819426053747814e-06, "loss": 0.0071, "reward": 2.7119061946868896, "reward_std": 1.0232970714569092, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4931562542915344, "step": 2191 }, { "completion_length": 109.34375, "epoch": 1.1728196896736223, "grad_norm": 1.099817156791687, "kl": 0.24836108088493347, "learning_rate": 3.818103746908883e-06, "loss": 0.0099, "reward": 2.2736563682556152, "reward_std": 0.9775730967521667, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4924062490463257, "step": 2192 }, { "completion_length": 115.9375, "epoch": 1.173354735152488, "grad_norm": 1.6554797887802124, "kl": 0.21259459853172302, "learning_rate": 3.816780929147682e-06, "loss": 0.0085, "reward": 2.1909375190734863, "reward_std": 0.6109352111816406, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48781251907348633, "step": 2193 }, { "completion_length": 125.84375, "epoch": 1.1738897806313537, "grad_norm": 7.21136474609375, "kl": 0.4180801212787628, "learning_rate": 3.81545760097696e-06, "loss": 0.0167, "reward": 2.921875, "reward_std": 0.8253346681594849, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2194 }, { "completion_length": 130.46875, "epoch": 1.1744248261102195, "grad_norm": 0.75071781873703, "kl": 0.18937024474143982, "learning_rate": 3.8141337629096646e-06, "loss": 0.0076, "reward": 1.932437539100647, "reward_std": 0.5767219662666321, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4636875092983246, "step": 2195 }, { "completion_length": 136.15625, "epoch": 1.174959871589085, "grad_norm": 0.8382118940353394, "kl": 0.17466028034687042, "learning_rate": 3.81280941545894e-06, "loss": 0.007, "reward": 2.2852187156677246, "reward_std": 1.0054489374160767, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4727187752723694, "step": 2196 }, { "completion_length": 141.96875, "epoch": 1.1754949170679507, "grad_norm": 1.9714915752410889, "kl": 0.18352439999580383, "learning_rate": 3.811484559138129e-06, "loss": 0.0073, "reward": 1.5311875343322754, "reward_std": 0.5602625608444214, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.453062504529953, "step": 2197 }, { "completion_length": 149.28125, "epoch": 1.1760299625468165, "grad_norm": 0.6997100710868835, "kl": 0.17866039276123047, "learning_rate": 3.810159194460772e-06, "loss": 0.0071, "reward": 2.357062578201294, "reward_std": 0.9831547141075134, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46643751859664917, "step": 2198 }, { "completion_length": 132.71875, "epoch": 1.1765650080256822, "grad_norm": 2.8779172897338867, "kl": 0.20484423637390137, "learning_rate": 3.808833321940605e-06, "loss": 0.0082, "reward": 1.926281213760376, "reward_std": 0.7311553955078125, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47315624356269836, "step": 2199 }, { "completion_length": 106.1875, "epoch": 1.177100053504548, "grad_norm": 2.0353758335113525, "kl": 0.21497194468975067, "learning_rate": 3.8075069420915612e-06, "loss": 0.0086, "reward": 2.415781259536743, "reward_std": 0.751129150390625, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49390625953674316, "step": 2200 }, { "completion_length": 143.75, "epoch": 1.1776350989834137, "grad_norm": 1.0251721143722534, "kl": 0.20210449397563934, "learning_rate": 3.8061800554277722e-06, "loss": 0.0081, "reward": 2.121218681335449, "reward_std": 0.5713167786598206, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.449343740940094, "step": 2201 }, { "completion_length": 137.65625, "epoch": 1.1781701444622792, "grad_norm": 0.8914329409599304, "kl": 0.1524733603000641, "learning_rate": 3.8048526624635627e-06, "loss": 0.0061, "reward": 1.987125039100647, "reward_std": 0.6975269913673401, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4402500092983246, "step": 2202 }, { "completion_length": 129.3125, "epoch": 1.178705189941145, "grad_norm": 2.3900504112243652, "kl": 0.2450343668460846, "learning_rate": 3.803524763713458e-06, "loss": 0.0098, "reward": 2.6205313205718994, "reward_std": 1.04849112033844, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46428126096725464, "step": 2203 }, { "completion_length": 132.21875, "epoch": 1.1792402354200107, "grad_norm": 1.2581814527511597, "kl": 0.16924774646759033, "learning_rate": 3.802196359692176e-06, "loss": 0.0068, "reward": 2.249000072479248, "reward_std": 0.6477094292640686, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45212501287460327, "step": 2204 }, { "completion_length": 127.34375, "epoch": 1.1797752808988764, "grad_norm": 0.5116264224052429, "kl": 0.16707774996757507, "learning_rate": 3.80086745091463e-06, "loss": 0.0067, "reward": 2.1968750953674316, "reward_std": 0.5635108947753906, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4781250059604645, "step": 2205 }, { "completion_length": 131.90625, "epoch": 1.1803103263777421, "grad_norm": 0.9540043473243713, "kl": 0.22488802671432495, "learning_rate": 3.7995380378959335e-06, "loss": 0.009, "reward": 2.254718780517578, "reward_std": 0.8769513368606567, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45784375071525574, "step": 2206 }, { "completion_length": 124.71875, "epoch": 1.1808453718566079, "grad_norm": 0.9464832544326782, "kl": 0.19416877627372742, "learning_rate": 3.798208121151391e-06, "loss": 0.0078, "reward": 2.393843650817871, "reward_std": 1.0710136890411377, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47196874022483826, "step": 2207 }, { "completion_length": 128.0625, "epoch": 1.1813804173354736, "grad_norm": 3.55533504486084, "kl": 0.36075717210769653, "learning_rate": 3.796877701196503e-06, "loss": 0.0144, "reward": 2.111062526702881, "reward_std": 0.7580853700637817, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42356249690055847, "step": 2208 }, { "completion_length": 129.5625, "epoch": 1.1819154628143391, "grad_norm": 0.8839014768600464, "kl": 0.15787279605865479, "learning_rate": 3.795546778546967e-06, "loss": 0.0063, "reward": 2.275937557220459, "reward_std": 1.0100288391113281, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4634374976158142, "step": 2209 }, { "completion_length": 128.9375, "epoch": 1.1824505082932049, "grad_norm": 0.7458848357200623, "kl": 0.19607268273830414, "learning_rate": 3.7942153537186733e-06, "loss": 0.0078, "reward": 2.41796875, "reward_std": 0.9892290830612183, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46484375, "step": 2210 }, { "completion_length": 102.53125, "epoch": 1.1829855537720706, "grad_norm": 0.5570758581161499, "kl": 0.1648346334695816, "learning_rate": 3.7928834272277073e-06, "loss": 0.0066, "reward": 2.5712811946868896, "reward_std": 0.5102128386497498, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4931562542915344, "step": 2211 }, { "completion_length": 149.53125, "epoch": 1.1835205992509363, "grad_norm": 0.8850358724594116, "kl": 0.14670908451080322, "learning_rate": 3.791550999590351e-06, "loss": 0.0059, "reward": 1.5894999504089355, "reward_std": 0.9354082345962524, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4176250100135803, "step": 2212 }, { "completion_length": 112.1875, "epoch": 1.184055644729802, "grad_norm": 1.1114599704742432, "kl": 0.19661200046539307, "learning_rate": 3.7902180713230776e-06, "loss": 0.0079, "reward": 1.9022186994552612, "reward_std": 0.23929928243160248, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4803437292575836, "step": 2213 }, { "completion_length": 126.46875, "epoch": 1.1845906902086678, "grad_norm": 1.5538369417190552, "kl": 0.23069441318511963, "learning_rate": 3.788884642942555e-06, "loss": 0.0092, "reward": 1.888781189918518, "reward_std": 0.8965114951133728, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.42003124952316284, "step": 2214 }, { "completion_length": 116.9375, "epoch": 1.1851257356875333, "grad_norm": 1.3902121782302856, "kl": 0.2275538593530655, "learning_rate": 3.7875507149656465e-06, "loss": 0.0091, "reward": 3.125, "reward_std": 0.49871626496315, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2215 }, { "completion_length": 98.90625, "epoch": 1.185660781166399, "grad_norm": 0.9156366586685181, "kl": 0.21116122603416443, "learning_rate": 3.786216287909409e-06, "loss": 0.0084, "reward": 2.9870314598083496, "reward_std": 0.7380447387695312, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4870312511920929, "step": 2216 }, { "completion_length": 141.96875, "epoch": 1.1861958266452648, "grad_norm": 0.732574999332428, "kl": 0.1729746162891388, "learning_rate": 3.7848813622910896e-06, "loss": 0.0069, "reward": 2.0136561393737793, "reward_std": 0.9373753666877747, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46678125858306885, "step": 2217 }, { "completion_length": 138.6875, "epoch": 1.1867308721241305, "grad_norm": 2.7605974674224854, "kl": 0.18410533666610718, "learning_rate": 3.7835459386281336e-06, "loss": 0.0074, "reward": 1.3978750705718994, "reward_std": 0.7706713676452637, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42912501096725464, "step": 2218 }, { "completion_length": 149.0625, "epoch": 1.1872659176029963, "grad_norm": 1.9304622411727905, "kl": 0.19189657270908356, "learning_rate": 3.7822100174381754e-06, "loss": 0.0077, "reward": 1.8390313386917114, "reward_std": 1.0465140342712402, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.3859062194824219, "step": 2219 }, { "completion_length": 130.875, "epoch": 1.187800963081862, "grad_norm": 1.3478533029556274, "kl": 0.2149626910686493, "learning_rate": 3.7808735992390443e-06, "loss": 0.0086, "reward": 2.874281406402588, "reward_std": 0.5519306659698486, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46803125739097595, "step": 2220 }, { "completion_length": 149.59375, "epoch": 1.1883360085607277, "grad_norm": 0.9931278228759766, "kl": 0.1622251570224762, "learning_rate": 3.77953668454876e-06, "loss": 0.0065, "reward": 2.0864062309265137, "reward_std": 1.0840773582458496, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43015623092651367, "step": 2221 }, { "completion_length": 144.40625, "epoch": 1.1888710540395935, "grad_norm": 0.8390902876853943, "kl": 0.1575823873281479, "learning_rate": 3.778199273885539e-06, "loss": 0.0063, "reward": 1.5998749732971191, "reward_std": 0.9205014705657959, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.47487500309944153, "step": 2222 }, { "completion_length": 128.75, "epoch": 1.189406099518459, "grad_norm": 1.1622252464294434, "kl": 0.1709892898797989, "learning_rate": 3.776861367767785e-06, "loss": 0.0068, "reward": 2.379093647003174, "reward_std": 0.975707471370697, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4572187662124634, "step": 2223 }, { "completion_length": 150.5625, "epoch": 1.1899411449973247, "grad_norm": 1.0992989540100098, "kl": 0.18956004083156586, "learning_rate": 3.7755229667140972e-06, "loss": 0.0076, "reward": 1.843906283378601, "reward_std": 1.1768018007278442, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4220312535762787, "step": 2224 }, { "completion_length": 162.96875, "epoch": 1.1904761904761905, "grad_norm": 2.0330567359924316, "kl": 0.21466735005378723, "learning_rate": 3.774184071243264e-06, "loss": 0.0086, "reward": 1.7815937995910645, "reward_std": 0.8730887770652771, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4065937399864197, "step": 2225 }, { "completion_length": 155.5, "epoch": 1.1910112359550562, "grad_norm": 1.3669075965881348, "kl": 0.2545560300350189, "learning_rate": 3.7728446818742696e-06, "loss": 0.0102, "reward": 1.797374963760376, "reward_std": 1.011420488357544, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.40674999356269836, "step": 2226 }, { "completion_length": 125.75, "epoch": 1.191546281433922, "grad_norm": 0.8555120229721069, "kl": 0.1956443041563034, "learning_rate": 3.7715047991262854e-06, "loss": 0.0078, "reward": 2.10546875, "reward_std": 0.7153191566467285, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48046875, "step": 2227 }, { "completion_length": 120.9375, "epoch": 1.1920813269127877, "grad_norm": 0.9226614236831665, "kl": 0.17853814363479614, "learning_rate": 3.7701644235186753e-06, "loss": 0.0071, "reward": 1.9453125, "reward_std": 0.7404108643531799, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 2228 }, { "completion_length": 140.5625, "epoch": 1.1926163723916532, "grad_norm": 0.9419973492622375, "kl": 0.1633533388376236, "learning_rate": 3.7688235555709953e-06, "loss": 0.0065, "reward": 1.76381254196167, "reward_std": 0.7568950057029724, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46693748235702515, "step": 2229 }, { "completion_length": 135.90625, "epoch": 1.193151417870519, "grad_norm": 1.6125973463058472, "kl": 0.1906615048646927, "learning_rate": 3.7674821958029904e-06, "loss": 0.0076, "reward": 2.0834999084472656, "reward_std": 1.0356298685073853, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4428749978542328, "step": 2230 }, { "completion_length": 132.8125, "epoch": 1.1936864633493847, "grad_norm": 1.745357632637024, "kl": 0.1747809201478958, "learning_rate": 3.7661403447345983e-06, "loss": 0.007, "reward": 2.155937671661377, "reward_std": 0.9576929211616516, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4371874928474426, "step": 2231 }, { "completion_length": 158.1875, "epoch": 1.1942215088282504, "grad_norm": 1.7476541996002197, "kl": 0.18176579475402832, "learning_rate": 3.7647980028859466e-06, "loss": 0.0073, "reward": 2.1778125762939453, "reward_std": 0.7822679281234741, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42781251668930054, "step": 2232 }, { "completion_length": 122.84375, "epoch": 1.1947565543071161, "grad_norm": 1.1541004180908203, "kl": 0.17818573117256165, "learning_rate": 3.7634551707773516e-06, "loss": 0.0071, "reward": 1.61328125, "reward_std": 0.5854257345199585, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 2233 }, { "completion_length": 165.03125, "epoch": 1.1952915997859819, "grad_norm": 0.8103793263435364, "kl": 0.16168701648712158, "learning_rate": 3.762111848929322e-06, "loss": 0.0065, "reward": 1.7646563053131104, "reward_std": 1.0172481536865234, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3584062457084656, "step": 2234 }, { "completion_length": 151.40625, "epoch": 1.1958266452648476, "grad_norm": 0.7488526701927185, "kl": 0.17767445743083954, "learning_rate": 3.760768037862553e-06, "loss": 0.0071, "reward": 2.163062572479248, "reward_std": 0.8804678916931152, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42868751287460327, "step": 2235 }, { "completion_length": 117.3125, "epoch": 1.1963616907437131, "grad_norm": 1.0369850397109985, "kl": 0.23237943649291992, "learning_rate": 3.7594237380979343e-06, "loss": 0.0093, "reward": 2.0175938606262207, "reward_std": 1.044619083404541, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47071874141693115, "step": 2236 }, { "completion_length": 144.0625, "epoch": 1.1968967362225789, "grad_norm": 1.122534990310669, "kl": 0.1920514702796936, "learning_rate": 3.7580789501565407e-06, "loss": 0.0077, "reward": 1.408562421798706, "reward_std": 0.5566316246986389, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45543748140335083, "step": 2237 }, { "completion_length": 128.6875, "epoch": 1.1974317817014446, "grad_norm": 1.3655139207839966, "kl": 0.18444982171058655, "learning_rate": 3.756733674559638e-06, "loss": 0.0074, "reward": 2.19921875, "reward_std": 0.9093985557556152, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46484375, "step": 2238 }, { "completion_length": 155.03125, "epoch": 1.1979668271803103, "grad_norm": 0.6008023619651794, "kl": 0.14829501509666443, "learning_rate": 3.7553879118286794e-06, "loss": 0.0059, "reward": 1.50390625, "reward_std": 0.6057882308959961, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41015625, "step": 2239 }, { "completion_length": 127.15625, "epoch": 1.198501872659176, "grad_norm": 0.8716109991073608, "kl": 0.16748860478401184, "learning_rate": 3.754041662485311e-06, "loss": 0.0067, "reward": 1.8740313053131104, "reward_std": 0.6199586987495422, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4677812457084656, "step": 2240 }, { "completion_length": 122.0625, "epoch": 1.1990369181380418, "grad_norm": 2.404637098312378, "kl": 0.24567875266075134, "learning_rate": 3.7526949270513637e-06, "loss": 0.0098, "reward": 2.3878438472747803, "reward_std": 1.0466084480285645, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4815937578678131, "step": 2241 }, { "completion_length": 118.3125, "epoch": 1.1995719636169073, "grad_norm": 3.3809738159179688, "kl": 0.3144787847995758, "learning_rate": 3.751347706048857e-06, "loss": 0.0126, "reward": 2.2694687843322754, "reward_std": 0.7649358510971069, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.488218754529953, "step": 2242 }, { "completion_length": 132.875, "epoch": 1.200107009095773, "grad_norm": 2.4589595794677734, "kl": 0.32542717456817627, "learning_rate": 3.7500000000000005e-06, "loss": 0.013, "reward": 1.7616875171661377, "reward_std": 0.6641401052474976, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4648125171661377, "step": 2243 }, { "completion_length": 120.875, "epoch": 1.2006420545746388, "grad_norm": 1.617456316947937, "kl": 0.22110188007354736, "learning_rate": 3.7486518094271907e-06, "loss": 0.0088, "reward": 1.937250018119812, "reward_std": 0.6931746602058411, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499750018119812, "step": 2244 }, { "completion_length": 134.34375, "epoch": 1.2011771000535045, "grad_norm": 3.631133556365967, "kl": 0.5421677231788635, "learning_rate": 3.747303134853012e-06, "loss": 0.0217, "reward": 1.2762500047683716, "reward_std": 0.4146111309528351, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4481250047683716, "step": 2245 }, { "completion_length": 135.5625, "epoch": 1.2017121455323703, "grad_norm": 0.6740769147872925, "kl": 0.17130914330482483, "learning_rate": 3.7459539768002375e-06, "loss": 0.0069, "reward": 2.0155625343322754, "reward_std": 0.6812435388565063, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.468687504529953, "step": 2246 }, { "completion_length": 154.71875, "epoch": 1.202247191011236, "grad_norm": 10.208616256713867, "kl": 3.030829429626465, "learning_rate": 3.744604335791825e-06, "loss": 0.1212, "reward": 2.2318124771118164, "reward_std": 1.1581560373306274, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4349375069141388, "step": 2247 }, { "completion_length": 141.15625, "epoch": 1.2027822364901017, "grad_norm": 1.0225558280944824, "kl": 0.20366553962230682, "learning_rate": 3.743254212350922e-06, "loss": 0.0081, "reward": 2.24637508392334, "reward_std": 0.8971000909805298, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4807499945163727, "step": 2248 }, { "completion_length": 126.375, "epoch": 1.2033172819689675, "grad_norm": 0.5431745052337646, "kl": 0.16403120756149292, "learning_rate": 3.7419036070008625e-06, "loss": 0.0066, "reward": 2.609375, "reward_std": 0.5429104566574097, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2249 }, { "completion_length": 123.75, "epoch": 1.203852327447833, "grad_norm": 1.9458101987838745, "kl": 0.326921284198761, "learning_rate": 3.740552520265167e-06, "loss": 0.0131, "reward": 1.6946874856948853, "reward_std": 0.7660499811172485, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.44468748569488525, "step": 2250 }, { "completion_length": 139.34375, "epoch": 1.2043873729266987, "grad_norm": 1.306922197341919, "kl": 0.21451455354690552, "learning_rate": 3.739200952667542e-06, "loss": 0.0086, "reward": 2.297999858856201, "reward_std": 0.41988882422447205, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4698750078678131, "step": 2251 }, { "completion_length": 153.59375, "epoch": 1.2049224184055645, "grad_norm": 0.7228028178215027, "kl": 0.17926539480686188, "learning_rate": 3.737848904731882e-06, "loss": 0.0072, "reward": 1.833343744277954, "reward_std": 0.9460963010787964, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4114687442779541, "step": 2252 }, { "completion_length": 125.15625, "epoch": 1.2054574638844302, "grad_norm": 0.7493937611579895, "kl": 0.18273913860321045, "learning_rate": 3.736496376982266e-06, "loss": 0.0073, "reward": 1.9945937395095825, "reward_std": 0.8893576860427856, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4477187395095825, "step": 2253 }, { "completion_length": 122.90625, "epoch": 1.205992509363296, "grad_norm": 1.0810211896896362, "kl": 0.21243730187416077, "learning_rate": 3.7351433699429596e-06, "loss": 0.0085, "reward": 2.1189687252044678, "reward_std": 0.9850417375564575, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46271878480911255, "step": 2254 }, { "completion_length": 149.5, "epoch": 1.2065275548421617, "grad_norm": 4.598564147949219, "kl": 0.14505775272846222, "learning_rate": 3.733789884138415e-06, "loss": 0.0058, "reward": 2.6712188720703125, "reward_std": 0.7874492406845093, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48371875286102295, "step": 2255 }, { "completion_length": 132.5, "epoch": 1.2070626003210272, "grad_norm": 0.9237064123153687, "kl": 0.1676744818687439, "learning_rate": 3.7324359200932675e-06, "loss": 0.0067, "reward": 2.36146879196167, "reward_std": 1.066443681716919, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45521873235702515, "step": 2256 }, { "completion_length": 131.53125, "epoch": 1.207597645799893, "grad_norm": 1.3540353775024414, "kl": 0.19919449090957642, "learning_rate": 3.731081478332341e-06, "loss": 0.008, "reward": 1.719249963760376, "reward_std": 0.35792550444602966, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46924999356269836, "step": 2257 }, { "completion_length": 150.6875, "epoch": 1.2081326912787587, "grad_norm": 1.4948123693466187, "kl": 0.23936668038368225, "learning_rate": 3.729726559380643e-06, "loss": 0.0096, "reward": 1.78515625, "reward_std": 1.0581841468811035, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45703125, "step": 2258 }, { "completion_length": 137.28125, "epoch": 1.2086677367576244, "grad_norm": 10.076682090759277, "kl": 0.9620591402053833, "learning_rate": 3.728371163763366e-06, "loss": 0.0385, "reward": 2.3778750896453857, "reward_std": 0.9416611194610596, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4560000002384186, "step": 2259 }, { "completion_length": 138.65625, "epoch": 1.2092027822364901, "grad_norm": 7123579904.0, "kl": 243317872.0, "learning_rate": 3.7270152920058874e-06, "loss": 9732714.0, "reward": 1.9209063053131104, "reward_std": 0.31368693709373474, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4677812457084656, "step": 2260 }, { "completion_length": 113.0, "epoch": 1.2097378277153559, "grad_norm": 0.8845975399017334, "kl": 0.20026829838752747, "learning_rate": 3.7256589446337686e-06, "loss": 0.008, "reward": 2.0234687328338623, "reward_std": 0.7322860956192017, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4922187328338623, "step": 2261 }, { "completion_length": 133.46875, "epoch": 1.2102728731942216, "grad_norm": 1.2294999361038208, "kl": 0.17958980798721313, "learning_rate": 3.7243021221727553e-06, "loss": 0.0072, "reward": 1.9196562767028809, "reward_std": 0.578608512878418, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48215624690055847, "step": 2262 }, { "completion_length": 159.1875, "epoch": 1.2108079186730871, "grad_norm": 0.8232427835464478, "kl": 0.13135719299316406, "learning_rate": 3.7229448251487795e-06, "loss": 0.0053, "reward": 2.092156171798706, "reward_std": 1.125576376914978, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4515312612056732, "step": 2263 }, { "completion_length": 148.875, "epoch": 1.2113429641519529, "grad_norm": 1.4732403755187988, "kl": 0.20322026312351227, "learning_rate": 3.7215870540879545e-06, "loss": 0.0081, "reward": 1.3406562805175781, "reward_std": 0.6961737275123596, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41878125071525574, "step": 2264 }, { "completion_length": 139.84375, "epoch": 1.2118780096308186, "grad_norm": 1.0589803457260132, "kl": 0.13693389296531677, "learning_rate": 3.7202288095165776e-06, "loss": 0.0055, "reward": 2.2514374256134033, "reward_std": 0.8503342866897583, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4701874852180481, "step": 2265 }, { "completion_length": 133.09375, "epoch": 1.2124130551096843, "grad_norm": 275.804931640625, "kl": 41.799034118652344, "learning_rate": 3.7188700919611326e-06, "loss": 1.672, "reward": 2.979468822479248, "reward_std": 0.9225842356681824, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4638437330722809, "step": 2266 }, { "completion_length": 142.96875, "epoch": 1.21294810058855, "grad_norm": 1.5350574254989624, "kl": 0.22888273000717163, "learning_rate": 3.7175109019482826e-06, "loss": 0.0092, "reward": 2.1493124961853027, "reward_std": 1.0619460344314575, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46181249618530273, "step": 2267 }, { "completion_length": 119.28125, "epoch": 1.2134831460674158, "grad_norm": 2.074669361114502, "kl": 0.2916223108768463, "learning_rate": 3.716151240004876e-06, "loss": 0.0117, "reward": 1.9844374656677246, "reward_std": 0.6296237707138062, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.453187495470047, "step": 2268 }, { "completion_length": 130.1875, "epoch": 1.2140181915462813, "grad_norm": 5.5946269035339355, "kl": 0.4493938982486725, "learning_rate": 3.7147911066579453e-06, "loss": 0.018, "reward": 2.200000047683716, "reward_std": 0.6742970943450928, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46562498807907104, "step": 2269 }, { "completion_length": 120.28125, "epoch": 1.214553237025147, "grad_norm": 2.5491044521331787, "kl": 0.2295900583267212, "learning_rate": 3.713430502434702e-06, "loss": 0.0092, "reward": 2.5514373779296875, "reward_std": 0.776269257068634, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47331249713897705, "step": 2270 }, { "completion_length": 139.6875, "epoch": 1.2150882825040128, "grad_norm": 1.2039616107940674, "kl": 0.2585282325744629, "learning_rate": 3.712069427862544e-06, "loss": 0.0103, "reward": 1.2524999380111694, "reward_std": 0.5160866379737854, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4399999976158142, "step": 2271 }, { "completion_length": 144.09375, "epoch": 1.2156233279828785, "grad_norm": 1.1556445360183716, "kl": 0.25063252449035645, "learning_rate": 3.710707883469049e-06, "loss": 0.01, "reward": 1.3580937385559082, "reward_std": 0.7017271518707275, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4205937683582306, "step": 2272 }, { "completion_length": 135.34375, "epoch": 1.2161583734617443, "grad_norm": 1.575579285621643, "kl": 0.20341363549232483, "learning_rate": 3.7093458697819783e-06, "loss": 0.0081, "reward": 1.8427813053131104, "reward_std": 0.9306573867797852, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4521562457084656, "step": 2273 }, { "completion_length": 122.84375, "epoch": 1.21669341894061, "grad_norm": 0.793117105960846, "kl": 0.18166731297969818, "learning_rate": 3.707983387329274e-06, "loss": 0.0073, "reward": 2.6698436737060547, "reward_std": 0.70228111743927, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48234376311302185, "step": 2274 }, { "completion_length": 129.34375, "epoch": 1.2172284644194757, "grad_norm": 0.38876864314079285, "kl": 0.15258587896823883, "learning_rate": 3.706620436639061e-06, "loss": 0.0061, "reward": 1.8826875686645508, "reward_std": 0.150110125541687, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 2275 }, { "completion_length": 114.5625, "epoch": 1.2177635098983415, "grad_norm": 0.6028956174850464, "kl": 0.1578536480665207, "learning_rate": 3.7052570182396443e-06, "loss": 0.0063, "reward": 2.749875068664551, "reward_std": 0.5117098093032837, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 2276 }, { "completion_length": 152.25, "epoch": 1.218298555377207, "grad_norm": 0.8034933805465698, "kl": 0.1542086899280548, "learning_rate": 3.7038931326595116e-06, "loss": 0.0062, "reward": 1.73046875, "reward_std": 0.7092485427856445, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46484375, "step": 2277 }, { "completion_length": 111.5, "epoch": 1.2188336008560727, "grad_norm": 2.131197929382324, "kl": 0.22142836451530457, "learning_rate": 3.7025287804273314e-06, "loss": 0.0089, "reward": 2.9791250228881836, "reward_std": 0.7042322754859924, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4947499930858612, "step": 2278 }, { "completion_length": 141.4375, "epoch": 1.2193686463349385, "grad_norm": 1.927463173866272, "kl": 0.1785297691822052, "learning_rate": 3.701163962071952e-06, "loss": 0.0071, "reward": 1.9470000267028809, "reward_std": 0.9313443303108215, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47824999690055847, "step": 2279 }, { "completion_length": 123.9375, "epoch": 1.2199036918138042, "grad_norm": 0.8240671157836914, "kl": 0.18778195977210999, "learning_rate": 3.6997986781224032e-06, "loss": 0.0075, "reward": 2.15625, "reward_std": 0.5422549843788147, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 2280 }, { "completion_length": 148.5625, "epoch": 1.22043873729267, "grad_norm": 1.3259937763214111, "kl": 0.1782839447259903, "learning_rate": 3.6984329291078963e-06, "loss": 0.0071, "reward": 1.9920625686645508, "reward_std": 0.9886474609375, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 2281 }, { "completion_length": 133.90625, "epoch": 1.2209737827715357, "grad_norm": 0.7478596568107605, "kl": 0.15623082220554352, "learning_rate": 3.6970667155578193e-06, "loss": 0.0062, "reward": 2.175468683242798, "reward_std": 0.603184700012207, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4879687428474426, "step": 2282 }, { "completion_length": 150.25, "epoch": 1.2215088282504012, "grad_norm": 0.8103448748588562, "kl": 0.18947236239910126, "learning_rate": 3.695700038001746e-06, "loss": 0.0076, "reward": 1.2789688110351562, "reward_std": 0.6724315881729126, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4352187514305115, "step": 2283 }, { "completion_length": 123.875, "epoch": 1.222043873729267, "grad_norm": 1.5449079275131226, "kl": 0.3066404461860657, "learning_rate": 3.694332896969425e-06, "loss": 0.0123, "reward": 1.662000060081482, "reward_std": 0.6590516567230225, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45887500047683716, "step": 2284 }, { "completion_length": 125.3125, "epoch": 1.2225789192081327, "grad_norm": 0.8818852305412292, "kl": 0.20600669085979462, "learning_rate": 3.6929652929907862e-06, "loss": 0.0082, "reward": 1.9771249294281006, "reward_std": 0.8814501166343689, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44587498903274536, "step": 2285 }, { "completion_length": 148.5625, "epoch": 1.2231139646869984, "grad_norm": 1.1627204418182373, "kl": 0.16496093571186066, "learning_rate": 3.69159722659594e-06, "loss": 0.0066, "reward": 1.8968124389648438, "reward_std": 0.9986473321914673, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4280624985694885, "step": 2286 }, { "completion_length": 133.25, "epoch": 1.2236490101658641, "grad_norm": 1.3698681592941284, "kl": 0.16289226710796356, "learning_rate": 3.6902286983151737e-06, "loss": 0.0065, "reward": 1.7114062309265137, "reward_std": 0.6357433795928955, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43015626072883606, "step": 2287 }, { "completion_length": 138.375, "epoch": 1.2241840556447299, "grad_norm": 0.9272077679634094, "kl": 0.22508302330970764, "learning_rate": 3.688859708678957e-06, "loss": 0.009, "reward": 1.6498124599456787, "reward_std": 0.9076617956161499, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4310625195503235, "step": 2288 }, { "completion_length": 143.84375, "epoch": 1.2247191011235956, "grad_norm": 0.5272965431213379, "kl": 0.16943445801734924, "learning_rate": 3.687490258217935e-06, "loss": 0.0068, "reward": 2.0087499618530273, "reward_std": 0.6421575546264648, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.44624999165534973, "step": 2289 }, { "completion_length": 109.03125, "epoch": 1.2252541466024611, "grad_norm": 74037.234375, "kl": 91.56483459472656, "learning_rate": 3.6861203474629335e-06, "loss": 3.6626, "reward": 2.4306249618530273, "reward_std": 0.786727249622345, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49312499165534973, "step": 2290 }, { "completion_length": 127.84375, "epoch": 1.2257891920813269, "grad_norm": 0.9694074392318726, "kl": 0.20707395672798157, "learning_rate": 3.684749976944957e-06, "loss": 0.0083, "reward": 2.0677499771118164, "reward_std": 0.9026356935501099, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4271249771118164, "step": 2291 }, { "completion_length": 138.71875, "epoch": 1.2263242375601926, "grad_norm": 0.628803551197052, "kl": 0.15451355278491974, "learning_rate": 3.683379147195185e-06, "loss": 0.0062, "reward": 2.0703125, "reward_std": 0.7895330190658569, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 2292 }, { "completion_length": 146.3125, "epoch": 1.2268592830390583, "grad_norm": 0.940357506275177, "kl": 0.16508278250694275, "learning_rate": 3.68200785874498e-06, "loss": 0.0066, "reward": 1.7949062585830688, "reward_std": 0.6115150451660156, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46678125858306885, "step": 2293 }, { "completion_length": 98.46875, "epoch": 1.227394328517924, "grad_norm": 0.8726630210876465, "kl": 0.21511399745941162, "learning_rate": 3.680636112125878e-06, "loss": 0.0086, "reward": 2.711750030517578, "reward_std": 0.8193044662475586, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47737500071525574, "step": 2294 }, { "completion_length": 128.78125, "epoch": 1.2279293739967898, "grad_norm": 0.5071130990982056, "kl": 0.14742128551006317, "learning_rate": 3.679263907869596e-06, "loss": 0.0059, "reward": 1.94140625, "reward_std": 0.5862878561019897, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 2295 }, { "completion_length": 144.875, "epoch": 1.2284644194756553, "grad_norm": 0.7736632823944092, "kl": 0.21175454556941986, "learning_rate": 3.6778912465080247e-06, "loss": 0.0085, "reward": 2.1971874237060547, "reward_std": 1.098656177520752, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44718751311302185, "step": 2296 }, { "completion_length": 139.34375, "epoch": 1.228999464954521, "grad_norm": 0.8894461393356323, "kl": 0.18441133201122284, "learning_rate": 3.676518128573234e-06, "loss": 0.0074, "reward": 2.274531364440918, "reward_std": 1.3813624382019043, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4151562452316284, "step": 2297 }, { "completion_length": 127.28125, "epoch": 1.2295345104333868, "grad_norm": 1.513822078704834, "kl": 0.26902252435684204, "learning_rate": 3.6751445545974727e-06, "loss": 0.0108, "reward": 1.9994688034057617, "reward_std": 0.843029260635376, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46821874380111694, "step": 2298 }, { "completion_length": 141.34375, "epoch": 1.2300695559122525, "grad_norm": 1.4200348854064941, "kl": 0.15700018405914307, "learning_rate": 3.6737705251131628e-06, "loss": 0.0063, "reward": 1.9329376220703125, "reward_std": 1.0141139030456543, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41731250286102295, "step": 2299 }, { "completion_length": 123.8125, "epoch": 1.2306046013911183, "grad_norm": 0.9444994926452637, "kl": 0.16933080554008484, "learning_rate": 3.672396040652905e-06, "loss": 0.0068, "reward": 2.033843755722046, "reward_std": 0.9628925323486328, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4713437259197235, "step": 2300 }, { "completion_length": 144.0625, "epoch": 1.231139646869984, "grad_norm": 16086571.0, "kl": 1345624.5, "learning_rate": 3.671021101749476e-06, "loss": 53824.9844, "reward": 1.3523125648498535, "reward_std": 0.9344874620437622, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43043750524520874, "step": 2301 }, { "completion_length": 139.96875, "epoch": 1.2316746923488497, "grad_norm": 0.8058692216873169, "kl": 0.2169136255979538, "learning_rate": 3.669645708935827e-06, "loss": 0.0087, "reward": 1.702968716621399, "reward_std": 0.8771690130233765, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4060937464237213, "step": 2302 }, { "completion_length": 136.40625, "epoch": 1.2322097378277155, "grad_norm": 1.3642890453338623, "kl": 0.1881657838821411, "learning_rate": 3.668269862745089e-06, "loss": 0.0075, "reward": 1.4841874837875366, "reward_std": 0.6591209173202515, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4685624837875366, "step": 2303 }, { "completion_length": 151.3125, "epoch": 1.232744783306581, "grad_norm": 0.6927839517593384, "kl": 0.20702587068080902, "learning_rate": 3.6668935637105645e-06, "loss": 0.0083, "reward": 1.4048750400543213, "reward_std": 0.4595561623573303, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4361249804496765, "step": 2304 }, { "completion_length": 127.5, "epoch": 1.2332798287854467, "grad_norm": 0.816020667552948, "kl": 0.17187142372131348, "learning_rate": 3.6655168123657343e-06, "loss": 0.0069, "reward": 1.9929375648498535, "reward_std": 0.7104349136352539, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43043750524520874, "step": 2305 }, { "completion_length": 143.25, "epoch": 1.2338148742643125, "grad_norm": 0.8255177736282349, "kl": 0.16737103462219238, "learning_rate": 3.6641396092442527e-06, "loss": 0.0067, "reward": 1.413656234741211, "reward_std": 0.4120520055294037, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44490623474121094, "step": 2306 }, { "completion_length": 145.59375, "epoch": 1.2343499197431782, "grad_norm": 0.6259283423423767, "kl": 0.16370561718940735, "learning_rate": 3.662761954879951e-06, "loss": 0.0065, "reward": 1.9285626411437988, "reward_std": 0.7643470764160156, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4129374921321869, "step": 2307 }, { "completion_length": 136.75, "epoch": 1.234884965222044, "grad_norm": 1.1654947996139526, "kl": 0.22990946471691132, "learning_rate": 3.661383849806834e-06, "loss": 0.0092, "reward": 1.9389063119888306, "reward_std": 0.8859450817108154, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4701562523841858, "step": 2308 }, { "completion_length": 117.09375, "epoch": 1.2354200107009095, "grad_norm": 0.5511780381202698, "kl": 0.14868991076946259, "learning_rate": 3.660005294559081e-06, "loss": 0.0059, "reward": 1.8586249351501465, "reward_std": 0.8974405527114868, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45237499475479126, "step": 2309 }, { "completion_length": 161.03125, "epoch": 1.2359550561797752, "grad_norm": 1.3002947568893433, "kl": 0.14267395436763763, "learning_rate": 3.658626289671048e-06, "loss": 0.0057, "reward": 1.1395000219345093, "reward_std": 0.801703155040741, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3738749921321869, "step": 2310 }, { "completion_length": 142.09375, "epoch": 1.236490101658641, "grad_norm": 2.1317455768585205, "kl": 0.24574369192123413, "learning_rate": 3.657246835677262e-06, "loss": 0.0098, "reward": 1.522937536239624, "reward_std": 0.6090100407600403, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47606250643730164, "step": 2311 }, { "completion_length": 134.0, "epoch": 1.2370251471375067, "grad_norm": 0.5643568634986877, "kl": 0.1635178029537201, "learning_rate": 3.655866933112428e-06, "loss": 0.0065, "reward": 2.265625, "reward_std": 0.5193343758583069, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 2312 }, { "completion_length": 136.25, "epoch": 1.2375601926163724, "grad_norm": 0.6920648813247681, "kl": 0.18556219339370728, "learning_rate": 3.65448658251142e-06, "loss": 0.0074, "reward": 2.2557811737060547, "reward_std": 0.7645730972290039, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.44328126311302185, "step": 2313 }, { "completion_length": 133.28125, "epoch": 1.2380952380952381, "grad_norm": 0.8972164988517761, "kl": 0.18222607672214508, "learning_rate": 3.6531057844092906e-06, "loss": 0.0073, "reward": 1.7901875972747803, "reward_std": 0.7735999822616577, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4308125078678131, "step": 2314 }, { "completion_length": 138.46875, "epoch": 1.2386302835741039, "grad_norm": 0.8870901465415955, "kl": 0.17768950760364532, "learning_rate": 3.6517245393412614e-06, "loss": 0.0071, "reward": 2.301562547683716, "reward_std": 0.9360629320144653, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45781251788139343, "step": 2315 }, { "completion_length": 148.59375, "epoch": 1.2391653290529696, "grad_norm": 0.3622925579547882, "kl": 0.1234094649553299, "learning_rate": 3.650342847842732e-06, "loss": 0.0049, "reward": 1.6424686908721924, "reward_std": 0.40672755241394043, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.40809375047683716, "step": 2316 }, { "completion_length": 145.90625, "epoch": 1.2397003745318351, "grad_norm": 1.6179579496383667, "kl": 0.15075747668743134, "learning_rate": 3.64896071044927e-06, "loss": 0.006, "reward": 1.749343752861023, "reward_std": 0.7353832125663757, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.35871875286102295, "step": 2317 }, { "completion_length": 127.5625, "epoch": 1.2402354200107009, "grad_norm": 0.6549327969551086, "kl": 0.16587020456790924, "learning_rate": 3.6475781276966206e-06, "loss": 0.0066, "reward": 1.72265625, "reward_std": 0.7130071520805359, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 2318 }, { "completion_length": 152.71875, "epoch": 1.2407704654895666, "grad_norm": 1.012118935585022, "kl": 0.19951941072940826, "learning_rate": 3.646195100120698e-06, "loss": 0.008, "reward": 1.9268125295639038, "reward_std": 0.47186046838760376, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45806246995925903, "step": 2319 }, { "completion_length": 130.1875, "epoch": 1.2413055109684323, "grad_norm": 0.5987575054168701, "kl": 0.19137820601463318, "learning_rate": 3.6448116282575896e-06, "loss": 0.0077, "reward": 2.212156295776367, "reward_std": 0.3902488946914673, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4777812361717224, "step": 2320 }, { "completion_length": 105.75, "epoch": 1.241840556447298, "grad_norm": 1.518389105796814, "kl": 0.19245192408561707, "learning_rate": 3.643427712643558e-06, "loss": 0.0077, "reward": 1.608781337738037, "reward_std": 0.7165783643722534, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45253121852874756, "step": 2321 }, { "completion_length": 133.59375, "epoch": 1.2423756019261638, "grad_norm": 1.7080758810043335, "kl": 0.17746183276176453, "learning_rate": 3.642043353815033e-06, "loss": 0.0071, "reward": 2.4177188873291016, "reward_std": 0.9928857684135437, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4489687383174896, "step": 2322 }, { "completion_length": 152.90625, "epoch": 1.2429106474050293, "grad_norm": 0.7196208238601685, "kl": 0.14882232248783112, "learning_rate": 3.6406585523086196e-06, "loss": 0.006, "reward": 1.7909998893737793, "reward_std": 0.6124814748764038, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43162500858306885, "step": 2323 }, { "completion_length": 113.875, "epoch": 1.243445692883895, "grad_norm": 1.0371863842010498, "kl": 0.1716178059577942, "learning_rate": 3.6392733086610933e-06, "loss": 0.0069, "reward": 2.531125068664551, "reward_std": 0.4050799012184143, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 2324 }, { "completion_length": 163.875, "epoch": 1.2439807383627608, "grad_norm": 0.7980861663818359, "kl": 0.11834526807069778, "learning_rate": 3.637887623409401e-06, "loss": 0.0047, "reward": 0.9640624523162842, "reward_std": 0.698636531829834, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.35468751192092896, "step": 2325 }, { "completion_length": 146.6875, "epoch": 1.2445157838416265, "grad_norm": 1.3093029260635376, "kl": 0.1438561975955963, "learning_rate": 3.6365014970906614e-06, "loss": 0.0058, "reward": 0.9975625276565552, "reward_std": 0.4870831370353699, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4194375276565552, "step": 2326 }, { "completion_length": 125.0, "epoch": 1.2450508293204923, "grad_norm": 0.9531550407409668, "kl": 0.18336832523345947, "learning_rate": 3.6351149302421636e-06, "loss": 0.0073, "reward": 2.069187641143799, "reward_std": 0.7207481861114502, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4441874921321869, "step": 2327 }, { "completion_length": 109.3125, "epoch": 1.245585874799358, "grad_norm": 1.740848422050476, "kl": 0.18169450759887695, "learning_rate": 3.6337279234013664e-06, "loss": 0.0073, "reward": 2.082124948501587, "reward_std": 0.6846818327903748, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4727500081062317, "step": 2328 }, { "completion_length": 134.90625, "epoch": 1.2461209202782237, "grad_norm": 0.7195886373519897, "kl": 0.17955444753170013, "learning_rate": 3.632340477105901e-06, "loss": 0.0072, "reward": 2.0789999961853027, "reward_std": 0.7164689898490906, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43837499618530273, "step": 2329 }, { "completion_length": 127.6875, "epoch": 1.2466559657570893, "grad_norm": 1.3524320125579834, "kl": 0.21242564916610718, "learning_rate": 3.630952591893568e-06, "loss": 0.0085, "reward": 2.1586251258850098, "reward_std": 0.8780450820922852, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4242500066757202, "step": 2330 }, { "completion_length": 94.34375, "epoch": 1.247191011235955, "grad_norm": 1.6174792051315308, "kl": 0.22988852858543396, "learning_rate": 3.629564268302338e-06, "loss": 0.0092, "reward": 2.19921875, "reward_std": 0.5363776087760925, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49609375, "step": 2331 }, { "completion_length": 127.0, "epoch": 1.2477260567148207, "grad_norm": 0.9476041197776794, "kl": 0.14408338069915771, "learning_rate": 3.6281755068703527e-06, "loss": 0.0058, "reward": 1.878812551498413, "reward_std": 1.0135376453399658, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4256874918937683, "step": 2332 }, { "completion_length": 126.625, "epoch": 1.2482611021936865, "grad_norm": 0.5678253173828125, "kl": 0.17647889256477356, "learning_rate": 3.626786308135922e-06, "loss": 0.0071, "reward": 2.126500129699707, "reward_std": 0.6077790260314941, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4858750104904175, "step": 2333 }, { "completion_length": 105.375, "epoch": 1.2487961476725522, "grad_norm": 2.5153567790985107, "kl": 0.41439929604530334, "learning_rate": 3.6253966726375255e-06, "loss": 0.0166, "reward": 2.6684062480926514, "reward_std": 0.7156611084938049, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48090624809265137, "step": 2334 }, { "completion_length": 144.40625, "epoch": 1.249331193151418, "grad_norm": 7282564.5, "kl": 92441.6328125, "learning_rate": 3.624006600913814e-06, "loss": 3697.6643, "reward": 2.2400624752044678, "reward_std": 0.929480791091919, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45881250500679016, "step": 2335 }, { "completion_length": 135.34375, "epoch": 1.2498662386302835, "grad_norm": 0.633622407913208, "kl": 0.23531758785247803, "learning_rate": 3.6226160935036052e-06, "loss": 0.0094, "reward": 2.66796875, "reward_std": 0.48610028624534607, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49609375, "step": 2336 }, { "completion_length": 139.5, "epoch": 1.2504012841091492, "grad_norm": 2.165390968322754, "kl": 0.20855821669101715, "learning_rate": 3.621225150945886e-06, "loss": 0.0083, "reward": 1.5634374618530273, "reward_std": 0.9608262777328491, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.43843749165534973, "step": 2337 }, { "completion_length": 133.21875, "epoch": 1.250936329588015, "grad_norm": 1.2868105173110962, "kl": 0.16091537475585938, "learning_rate": 3.619833773779813e-06, "loss": 0.0064, "reward": 2.3359687328338623, "reward_std": 1.090752124786377, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4453437328338623, "step": 2338 }, { "completion_length": 143.65625, "epoch": 1.2514713750668807, "grad_norm": 0.859196662902832, "kl": 0.18997779488563538, "learning_rate": 3.6184419625447094e-06, "loss": 0.0076, "reward": 1.5501562356948853, "reward_std": 0.6415876150131226, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42515623569488525, "step": 2339 }, { "completion_length": 136.4375, "epoch": 1.2520064205457464, "grad_norm": 1.3200353384017944, "kl": 0.19985446333885193, "learning_rate": 3.61704971778007e-06, "loss": 0.008, "reward": 1.6046562194824219, "reward_std": 0.49957650899887085, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47965624928474426, "step": 2340 }, { "completion_length": 131.5, "epoch": 1.2525414660246121, "grad_norm": 0.9325370192527771, "kl": 0.2152131199836731, "learning_rate": 3.615657040025554e-06, "loss": 0.0086, "reward": 2.1273436546325684, "reward_std": 0.9840123653411865, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4398437440395355, "step": 2341 }, { "completion_length": 117.78125, "epoch": 1.2530765115034779, "grad_norm": 0.9202417731285095, "kl": 0.19222530722618103, "learning_rate": 3.61426392982099e-06, "loss": 0.0077, "reward": 2.421875, "reward_std": 0.839560866355896, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 2342 }, { "completion_length": 150.6875, "epoch": 1.2536115569823436, "grad_norm": 1.1845450401306152, "kl": 0.14975741505622864, "learning_rate": 3.6128703877063755e-06, "loss": 0.006, "reward": 1.6647813320159912, "reward_std": 0.8978488445281982, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43040624260902405, "step": 2343 }, { "completion_length": 137.84375, "epoch": 1.2541466024612091, "grad_norm": 0.6880291104316711, "kl": 0.14331218600273132, "learning_rate": 3.6114764142218717e-06, "loss": 0.0057, "reward": 2.1425623893737793, "reward_std": 0.6380910873413086, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47068750858306885, "step": 2344 }, { "completion_length": 127.40625, "epoch": 1.2546816479400749, "grad_norm": 0.40335988998413086, "kl": 0.15547719597816467, "learning_rate": 3.6100820099078116e-06, "loss": 0.0062, "reward": 2.484375, "reward_std": 0.2414703369140625, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2345 }, { "completion_length": 138.78125, "epoch": 1.2552166934189406, "grad_norm": 1.7239311933517456, "kl": 0.18731310963630676, "learning_rate": 3.608687175304693e-06, "loss": 0.0075, "reward": 1.7283437252044678, "reward_std": 0.9707610607147217, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44709375500679016, "step": 2346 }, { "completion_length": 125.84375, "epoch": 1.2557517388978063, "grad_norm": 0.8530607223510742, "kl": 0.17940640449523926, "learning_rate": 3.607291910953179e-06, "loss": 0.0072, "reward": 2.7722811698913574, "reward_std": 0.8546769618988037, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4910312592983246, "step": 2347 }, { "completion_length": 124.375, "epoch": 1.256286784376672, "grad_norm": 0.5888671875, "kl": 0.1679450273513794, "learning_rate": 3.6058962173941026e-06, "loss": 0.0067, "reward": 2.147624969482422, "reward_std": 0.6871175169944763, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46012499928474426, "step": 2348 }, { "completion_length": 146.65625, "epoch": 1.2568218298555376, "grad_norm": 1.9854471683502197, "kl": 0.16336500644683838, "learning_rate": 3.60450009516846e-06, "loss": 0.0065, "reward": 1.5973750352859497, "reward_std": 0.7531864643096924, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.39424997568130493, "step": 2349 }, { "completion_length": 114.15625, "epoch": 1.2573568753344033, "grad_norm": 1.1607989072799683, "kl": 0.28974300622940063, "learning_rate": 3.603103544817415e-06, "loss": 0.0116, "reward": 1.5688124895095825, "reward_std": 0.4319048523902893, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4906874895095825, "step": 2350 }, { "completion_length": 139.0625, "epoch": 1.257891920813269, "grad_norm": 1.4966713190078735, "kl": 0.1573982834815979, "learning_rate": 3.601706566882299e-06, "loss": 0.0063, "reward": 1.6754686832427979, "reward_std": 0.8665360808372498, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4254687428474426, "step": 2351 }, { "completion_length": 169.59375, "epoch": 1.2584269662921348, "grad_norm": 0.617010772228241, "kl": 0.16376137733459473, "learning_rate": 3.6003091619046055e-06, "loss": 0.0066, "reward": 1.0692499876022339, "reward_std": 0.5523795485496521, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3817499876022339, "step": 2352 }, { "completion_length": 133.78125, "epoch": 1.2589620117710005, "grad_norm": 0.8033294081687927, "kl": 0.15393143892288208, "learning_rate": 3.5989113304259955e-06, "loss": 0.0062, "reward": 1.8128750324249268, "reward_std": 0.7116576433181763, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43787500262260437, "step": 2353 }, { "completion_length": 133.09375, "epoch": 1.2594970572498663, "grad_norm": 1.5525676012039185, "kl": 0.17309343814849854, "learning_rate": 3.597513072988296e-06, "loss": 0.0069, "reward": 2.3518123626708984, "reward_std": 1.0945756435394287, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4768125116825104, "step": 2354 }, { "completion_length": 108.375, "epoch": 1.260032102728732, "grad_norm": 3.9953196048736572, "kl": 0.33825328946113586, "learning_rate": 3.5961143901334984e-06, "loss": 0.0135, "reward": 2.331906318664551, "reward_std": 0.3670504689216614, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 2355 }, { "completion_length": 108.34375, "epoch": 1.2605671482075977, "grad_norm": 0.8514541387557983, "kl": 0.20910197496414185, "learning_rate": 3.5947152824037584e-06, "loss": 0.0084, "reward": 2.257312536239624, "reward_std": 0.7677892446517944, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46043750643730164, "step": 2356 }, { "completion_length": 107.96875, "epoch": 1.2611021936864635, "grad_norm": 1.1903530359268188, "kl": 0.22489510476589203, "learning_rate": 3.593315750341398e-06, "loss": 0.009, "reward": 2.139531373977661, "reward_std": 0.8329043388366699, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4832812547683716, "step": 2357 }, { "completion_length": 139.0, "epoch": 1.261637239165329, "grad_norm": 0.682449996471405, "kl": 0.15817248821258545, "learning_rate": 3.5919157944889017e-06, "loss": 0.0063, "reward": 2.182593822479248, "reward_std": 0.8580899834632874, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43259376287460327, "step": 2358 }, { "completion_length": 120.96875, "epoch": 1.2621722846441947, "grad_norm": 0.6306414604187012, "kl": 0.1990286111831665, "learning_rate": 3.590515415388919e-06, "loss": 0.008, "reward": 2.481156349182129, "reward_std": 0.9639334082603455, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48115622997283936, "step": 2359 }, { "completion_length": 152.25, "epoch": 1.2627073301230605, "grad_norm": 1.957392692565918, "kl": 0.2991892695426941, "learning_rate": 3.5891146135842647e-06, "loss": 0.012, "reward": 1.629906177520752, "reward_std": 0.3553221821784973, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.4111562669277191, "step": 2360 }, { "completion_length": 134.75, "epoch": 1.2632423756019262, "grad_norm": 0.6337845921516418, "kl": 0.18050725758075714, "learning_rate": 3.587713389617916e-06, "loss": 0.0072, "reward": 2.1147186756134033, "reward_std": 0.4919167160987854, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4272187352180481, "step": 2361 }, { "completion_length": 111.03125, "epoch": 1.263777421080792, "grad_norm": 0.7906578183174133, "kl": 0.170087531208992, "learning_rate": 3.5863117440330136e-06, "loss": 0.0068, "reward": 1.453125, "reward_std": 0.10205793380737305, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2362 }, { "completion_length": 105.25, "epoch": 1.2643124665596575, "grad_norm": 0.867626428604126, "kl": 0.19083137810230255, "learning_rate": 3.5849096773728627e-06, "loss": 0.0076, "reward": 1.4015624523162842, "reward_std": 0.37661683559417725, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49531251192092896, "step": 2363 }, { "completion_length": 159.5625, "epoch": 1.2648475120385232, "grad_norm": 1.0094345808029175, "kl": 0.18817973136901855, "learning_rate": 3.5835071901809314e-06, "loss": 0.0075, "reward": 1.460687518119812, "reward_std": 0.8511545658111572, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.366937518119812, "step": 2364 }, { "completion_length": 130.40625, "epoch": 1.265382557517389, "grad_norm": 0.8619552850723267, "kl": 0.20055519044399261, "learning_rate": 3.58210428300085e-06, "loss": 0.008, "reward": 1.7290624380111694, "reward_std": 0.8412307500839233, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4478124976158142, "step": 2365 }, { "completion_length": 153.03125, "epoch": 1.2659176029962547, "grad_norm": 0.9383620023727417, "kl": 0.14363020658493042, "learning_rate": 3.5807009563764133e-06, "loss": 0.0057, "reward": 1.7366249561309814, "reward_std": 0.720727801322937, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4241250157356262, "step": 2366 }, { "completion_length": 121.15625, "epoch": 1.2664526484751204, "grad_norm": 3.1051642894744873, "kl": 0.2926304042339325, "learning_rate": 3.579297210851576e-06, "loss": 0.0117, "reward": 1.5003124475479126, "reward_std": 0.5907131433486938, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4846875071525574, "step": 2367 }, { "completion_length": 123.5625, "epoch": 1.2669876939539861, "grad_norm": 0.8098077178001404, "kl": 0.1809631735086441, "learning_rate": 3.5778930469704576e-06, "loss": 0.0072, "reward": 2.3063125610351562, "reward_std": 0.6742262244224548, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4781875014305115, "step": 2368 }, { "completion_length": 105.34375, "epoch": 1.2675227394328519, "grad_norm": 1.8654756546020508, "kl": 0.2019227147102356, "learning_rate": 3.57648846527734e-06, "loss": 0.0081, "reward": 2.3007187843322754, "reward_std": 0.6028456687927246, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.488218754529953, "step": 2369 }, { "completion_length": 113.125, "epoch": 1.2680577849117176, "grad_norm": 2.7473669052124023, "kl": 0.19288745522499084, "learning_rate": 3.575083466316664e-06, "loss": 0.0077, "reward": 2.099249839782715, "reward_std": 1.0562782287597656, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4586249887943268, "step": 2370 }, { "completion_length": 118.6875, "epoch": 1.2685928303905831, "grad_norm": 0.4706806242465973, "kl": 0.19489340484142303, "learning_rate": 3.5736780506330356e-06, "loss": 0.0078, "reward": 2.348062515258789, "reward_std": 0.7209644317626953, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4730624854564667, "step": 2371 }, { "completion_length": 159.84375, "epoch": 1.2691278758694489, "grad_norm": 6.033718109130859, "kl": 0.4135114252567291, "learning_rate": 3.57227221877122e-06, "loss": 0.0165, "reward": 1.4202187061309814, "reward_std": 0.8405505418777466, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.3733437657356262, "step": 2372 }, { "completion_length": 112.34375, "epoch": 1.2696629213483146, "grad_norm": 0.6738152503967285, "kl": 0.2201249897480011, "learning_rate": 3.570865971276144e-06, "loss": 0.0088, "reward": 2.335625171661377, "reward_std": 0.8923088312149048, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4762499928474426, "step": 2373 }, { "completion_length": 127.34375, "epoch": 1.2701979668271803, "grad_norm": 0.7667103409767151, "kl": 0.18756550550460815, "learning_rate": 3.5694593086928984e-06, "loss": 0.0075, "reward": 2.4760000705718994, "reward_std": 0.5706050992012024, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49162501096725464, "step": 2374 }, { "completion_length": 127.78125, "epoch": 1.270733012306046, "grad_norm": 1.9036457538604736, "kl": 0.2660052478313446, "learning_rate": 3.568052231566731e-06, "loss": 0.0106, "reward": 1.9453125, "reward_std": 0.46121490001678467, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4453125, "step": 2375 }, { "completion_length": 127.21875, "epoch": 1.2712680577849116, "grad_norm": 0.8949145078659058, "kl": 0.18689164519309998, "learning_rate": 3.566644740443051e-06, "loss": 0.0075, "reward": 1.5709062814712524, "reward_std": 0.889336347579956, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46153125166893005, "step": 2376 }, { "completion_length": 118.78125, "epoch": 1.2718031032637773, "grad_norm": 1.2417627573013306, "kl": 0.21463528275489807, "learning_rate": 3.5652368358674304e-06, "loss": 0.0086, "reward": 2.5032501220703125, "reward_std": 0.718734860420227, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48762500286102295, "step": 2377 }, { "completion_length": 131.34375, "epoch": 1.272338148742643, "grad_norm": 1.5372283458709717, "kl": 0.21846160292625427, "learning_rate": 3.5638285183856e-06, "loss": 0.0087, "reward": 1.859375, "reward_std": 0.7073007225990295, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 2378 }, { "completion_length": 126.15625, "epoch": 1.2728731942215088, "grad_norm": 1.0051441192626953, "kl": 0.16365930438041687, "learning_rate": 3.5624197885434493e-06, "loss": 0.0065, "reward": 1.6515936851501465, "reward_std": 0.7223238348960876, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47971874475479126, "step": 2379 }, { "completion_length": 122.25, "epoch": 1.2734082397003745, "grad_norm": 1.041358232498169, "kl": 0.24340428411960602, "learning_rate": 3.5610106468870304e-06, "loss": 0.0097, "reward": 2.3770625591278076, "reward_std": 0.8274153470993042, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42393749952316284, "step": 2380 }, { "completion_length": 119.375, "epoch": 1.2739432851792403, "grad_norm": 0.6610001921653748, "kl": 0.16252915561199188, "learning_rate": 3.5596010939625524e-06, "loss": 0.0065, "reward": 2.765625, "reward_std": 0.7260611057281494, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2381 }, { "completion_length": 135.59375, "epoch": 1.274478330658106, "grad_norm": 1.8681538105010986, "kl": 0.18852341175079346, "learning_rate": 3.5581911303163857e-06, "loss": 0.0075, "reward": 1.955437421798706, "reward_std": 0.8967754244804382, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47106248140335083, "step": 2382 }, { "completion_length": 114.4375, "epoch": 1.2750133761369717, "grad_norm": 1.833168625831604, "kl": 0.34576350450515747, "learning_rate": 3.5567807564950595e-06, "loss": 0.0138, "reward": 3.015718936920166, "reward_std": 0.588125467300415, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4844687581062317, "step": 2383 }, { "completion_length": 126.4375, "epoch": 1.2755484216158375, "grad_norm": 4.9969305992126465, "kl": 0.17986330389976501, "learning_rate": 3.555369973045262e-06, "loss": 0.0072, "reward": 2.22265625, "reward_std": 1.0088021755218506, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 2384 }, { "completion_length": 147.5, "epoch": 1.276083467094703, "grad_norm": 2.12459659576416, "kl": 0.11484839022159576, "learning_rate": 3.5539587805138387e-06, "loss": 0.0046, "reward": 2.2824063301086426, "reward_std": 0.9827622771263123, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4074062705039978, "step": 2385 }, { "completion_length": 130.71875, "epoch": 1.2766185125735687, "grad_norm": 1.2535592317581177, "kl": 0.1569872498512268, "learning_rate": 3.552547179447795e-06, "loss": 0.0063, "reward": 2.632406234741211, "reward_std": 0.9774359464645386, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49178123474121094, "step": 2386 }, { "completion_length": 116.59375, "epoch": 1.2771535580524345, "grad_norm": 0.7306413650512695, "kl": 0.1848265826702118, "learning_rate": 3.551135170394296e-06, "loss": 0.0074, "reward": 2.591125011444092, "reward_std": 0.4797559380531311, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4973750114440918, "step": 2387 }, { "completion_length": 134.9375, "epoch": 1.2776886035313002, "grad_norm": 1.992546796798706, "kl": 0.16557496786117554, "learning_rate": 3.549722753900662e-06, "loss": 0.0066, "reward": 2.2278125286102295, "reward_std": 0.9156724214553833, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4309374988079071, "step": 2388 }, { "completion_length": 126.21875, "epoch": 1.278223649010166, "grad_norm": 3.905050754547119, "kl": 0.33283886313438416, "learning_rate": 3.5483099305143732e-06, "loss": 0.0133, "reward": 1.6045312881469727, "reward_std": 0.9888400435447693, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4482812285423279, "step": 2389 }, { "completion_length": 109.15625, "epoch": 1.2787586944890315, "grad_norm": 1.2818254232406616, "kl": 0.23708774149417877, "learning_rate": 3.546896700783067e-06, "loss": 0.0095, "reward": 2.1284687519073486, "reward_std": 0.5021796226501465, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.440968781709671, "step": 2390 }, { "completion_length": 134.59375, "epoch": 1.2792937399678972, "grad_norm": 0.9697751402854919, "kl": 0.17245739698410034, "learning_rate": 3.5454830652545373e-06, "loss": 0.0069, "reward": 1.9690937995910645, "reward_std": 0.6360250115394592, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4534687399864197, "step": 2391 }, { "completion_length": 125.28125, "epoch": 1.279828785446763, "grad_norm": 13.764908790588379, "kl": 0.2325672209262848, "learning_rate": 3.5440690244767385e-06, "loss": 0.0093, "reward": 2.5546250343322754, "reward_std": 0.8489419221878052, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.476500004529953, "step": 2392 }, { "completion_length": 135.78125, "epoch": 1.2803638309256287, "grad_norm": 3.030074119567871, "kl": 0.20727288722991943, "learning_rate": 3.5426545789977783e-06, "loss": 0.0083, "reward": 1.9680312871932983, "reward_std": 0.9582469463348389, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40553125739097595, "step": 2393 }, { "completion_length": 157.59375, "epoch": 1.2808988764044944, "grad_norm": 0.5212138295173645, "kl": 0.12256555259227753, "learning_rate": 3.541239729365924e-06, "loss": 0.0049, "reward": 1.4152500629425049, "reward_std": 0.5780212879180908, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4465000033378601, "step": 2394 }, { "completion_length": 116.46875, "epoch": 1.2814339218833601, "grad_norm": 70.594970703125, "kl": 27.83607292175293, "learning_rate": 3.5398244761295976e-06, "loss": 1.1134, "reward": 2.1606874465942383, "reward_std": 1.07432222366333, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45756250619888306, "step": 2395 }, { "completion_length": 126.71875, "epoch": 1.2819689673622259, "grad_norm": 0.6901609897613525, "kl": 0.17116107046604156, "learning_rate": 3.5384088198373765e-06, "loss": 0.0068, "reward": 2.227750062942505, "reward_std": 0.9576001167297363, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4777500033378601, "step": 2396 }, { "completion_length": 139.53125, "epoch": 1.2825040128410916, "grad_norm": 0.5109051465988159, "kl": 0.13548269867897034, "learning_rate": 3.5369927610380005e-06, "loss": 0.0054, "reward": 2.2578125, "reward_std": 0.5549706816673279, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 2397 }, { "completion_length": 130.71875, "epoch": 1.2830390583199571, "grad_norm": 1.6064082384109497, "kl": 0.13563494384288788, "learning_rate": 3.5355763002803582e-06, "loss": 0.0054, "reward": 2.5260000228881836, "reward_std": 0.7307578325271606, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4166250228881836, "step": 2398 }, { "completion_length": 123.25, "epoch": 1.2835741037988229, "grad_norm": 1.1342253684997559, "kl": 0.18063810467720032, "learning_rate": 3.5341594381134973e-06, "loss": 0.0072, "reward": 2.0506248474121094, "reward_std": 0.5161914229393005, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4881249964237213, "step": 2399 }, { "completion_length": 142.4375, "epoch": 1.2841091492776886, "grad_norm": 0.8634727001190186, "kl": 0.2057059407234192, "learning_rate": 3.532742175086621e-06, "loss": 0.0082, "reward": 2.613156318664551, "reward_std": 1.2703399658203125, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.441281259059906, "step": 2400 }, { "completion_length": 113.9375, "epoch": 1.2846441947565543, "grad_norm": 1.0015486478805542, "kl": 0.18224114179611206, "learning_rate": 3.531324511749088e-06, "loss": 0.0073, "reward": 2.213937520980835, "reward_std": 0.7758774161338806, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4951874911785126, "step": 2401 }, { "completion_length": 141.46875, "epoch": 1.28517924023542, "grad_norm": 1.376713514328003, "kl": 0.19722548127174377, "learning_rate": 3.5299064486504107e-06, "loss": 0.0079, "reward": 1.617593765258789, "reward_std": 0.8713535070419312, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44571876525878906, "step": 2402 }, { "completion_length": 126.65625, "epoch": 1.2857142857142856, "grad_norm": 0.9298856854438782, "kl": 0.17659175395965576, "learning_rate": 3.5284879863402587e-06, "loss": 0.0071, "reward": 2.2411251068115234, "reward_std": 0.7981414198875427, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4754999876022339, "step": 2403 }, { "completion_length": 129.8125, "epoch": 1.2862493311931513, "grad_norm": 1.446305274963379, "kl": 0.17272770404815674, "learning_rate": 3.527069125368455e-06, "loss": 0.0069, "reward": 1.91015625, "reward_std": 0.6713187098503113, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 2404 }, { "completion_length": 159.4375, "epoch": 1.286784376672017, "grad_norm": 3.0329623222351074, "kl": 0.14171457290649414, "learning_rate": 3.5256498662849758e-06, "loss": 0.0057, "reward": 1.578031301498413, "reward_std": 0.7260315418243408, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3905312418937683, "step": 2405 }, { "completion_length": 106.34375, "epoch": 1.2873194221508828, "grad_norm": 1.1629353761672974, "kl": 0.3035946488380432, "learning_rate": 3.5242302096399557e-06, "loss": 0.0121, "reward": 2.1016249656677246, "reward_std": 0.7031369209289551, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.476624995470047, "step": 2406 }, { "completion_length": 122.59375, "epoch": 1.2878544676297485, "grad_norm": 0.7217945456504822, "kl": 0.21208591759204865, "learning_rate": 3.522810155983678e-06, "loss": 0.0085, "reward": 2.2691564559936523, "reward_std": 0.7236884236335754, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.48790624737739563, "step": 2407 }, { "completion_length": 117.59375, "epoch": 1.2883895131086143, "grad_norm": 0.5622185468673706, "kl": 0.17716068029403687, "learning_rate": 3.521389705866586e-06, "loss": 0.0071, "reward": 2.234375, "reward_std": 0.507104218006134, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2408 }, { "completion_length": 131.96875, "epoch": 1.28892455858748, "grad_norm": 11.712928771972656, "kl": 0.5548161268234253, "learning_rate": 3.51996885983927e-06, "loss": 0.0222, "reward": 2.312906265258789, "reward_std": 0.7946093082427979, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45353126525878906, "step": 2409 }, { "completion_length": 134.40625, "epoch": 1.2894596040663457, "grad_norm": 1.9557212591171265, "kl": 0.5262947082519531, "learning_rate": 3.518547618452479e-06, "loss": 0.0211, "reward": 2.2571873664855957, "reward_std": 0.7575238943099976, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47593751549720764, "step": 2410 }, { "completion_length": 118.03125, "epoch": 1.2899946495452115, "grad_norm": 0.7341828346252441, "kl": 0.16367721557617188, "learning_rate": 3.517125982257113e-06, "loss": 0.0065, "reward": 2.468625068664551, "reward_std": 0.5976023077964783, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 2411 }, { "completion_length": 108.90625, "epoch": 1.290529695024077, "grad_norm": 1.4613317251205444, "kl": 0.3621910512447357, "learning_rate": 3.5157039518042257e-06, "loss": 0.0145, "reward": 1.7265625, "reward_std": 0.6532484889030457, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4765625, "step": 2412 }, { "completion_length": 136.78125, "epoch": 1.2910647405029427, "grad_norm": 2.364315986633301, "kl": 0.17991144955158234, "learning_rate": 3.5142815276450228e-06, "loss": 0.0072, "reward": 2.003406286239624, "reward_std": 0.4948708415031433, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.44090625643730164, "step": 2413 }, { "completion_length": 127.15625, "epoch": 1.2915997859818085, "grad_norm": 8.49045181274414, "kl": 1.085868000984192, "learning_rate": 3.5128587103308644e-06, "loss": 0.0434, "reward": 1.6547499895095825, "reward_std": 0.5055168867111206, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4828749895095825, "step": 2414 }, { "completion_length": 132.03125, "epoch": 1.2921348314606742, "grad_norm": 0.7716643214225769, "kl": 0.17592331767082214, "learning_rate": 3.5114355004132593e-06, "loss": 0.007, "reward": 2.16015625, "reward_std": 0.5449114441871643, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 2415 }, { "completion_length": 124.46875, "epoch": 1.29266987693954, "grad_norm": 1.0458025932312012, "kl": 0.19750294089317322, "learning_rate": 3.5100118984438737e-06, "loss": 0.0079, "reward": 2.6600000858306885, "reward_std": 0.7022399306297302, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4724999964237213, "step": 2416 }, { "completion_length": 124.15625, "epoch": 1.2932049224184055, "grad_norm": 1.0994457006454468, "kl": 0.17047257721424103, "learning_rate": 3.508587904974522e-06, "loss": 0.0068, "reward": 2.091249942779541, "reward_std": 0.8001350164413452, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4818750023841858, "step": 2417 }, { "completion_length": 140.96875, "epoch": 1.2937399678972712, "grad_norm": 1.2535381317138672, "kl": 0.18798360228538513, "learning_rate": 3.507163520557171e-06, "loss": 0.0075, "reward": 2.2735939025878906, "reward_std": 0.6882376670837402, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4767187535762787, "step": 2418 }, { "completion_length": 132.84375, "epoch": 1.294275013376137, "grad_norm": 0.971892774105072, "kl": 0.18575552105903625, "learning_rate": 3.505738745743938e-06, "loss": 0.0074, "reward": 1.8278124332427979, "reward_std": 0.5233705043792725, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4996874928474426, "step": 2419 }, { "completion_length": 136.59375, "epoch": 1.2948100588550027, "grad_norm": 1.578774094581604, "kl": 0.28327763080596924, "learning_rate": 3.504313581087096e-06, "loss": 0.0113, "reward": 2.1012187004089355, "reward_std": 1.0144597291946411, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.39809373021125793, "step": 2420 }, { "completion_length": 147.9375, "epoch": 1.2953451043338684, "grad_norm": 1.2698822021484375, "kl": 0.2589702606201172, "learning_rate": 3.502888027139065e-06, "loss": 0.0104, "reward": 1.8405624628067017, "reward_std": 0.8476513624191284, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43431249260902405, "step": 2421 }, { "completion_length": 118.6875, "epoch": 1.2958801498127341, "grad_norm": 2.675431966781616, "kl": 0.19257953763008118, "learning_rate": 3.5014620844524155e-06, "loss": 0.0077, "reward": 2.140625, "reward_std": 0.5791596174240112, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 2422 }, { "completion_length": 137.15625, "epoch": 1.2964151952915999, "grad_norm": 0.7802248001098633, "kl": 0.16949838399887085, "learning_rate": 3.5000357535798723e-06, "loss": 0.0068, "reward": 1.6599688529968262, "reward_std": 0.6020640134811401, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.456843763589859, "step": 2423 }, { "completion_length": 112.90625, "epoch": 1.2969502407704656, "grad_norm": 1.0476293563842773, "kl": 0.17581908404827118, "learning_rate": 3.4986090350743073e-06, "loss": 0.007, "reward": 2.7787814140319824, "reward_std": 0.9432874917984009, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4819062352180481, "step": 2424 }, { "completion_length": 142.09375, "epoch": 1.2974852862493311, "grad_norm": 0.6261356472969055, "kl": 0.13683180510997772, "learning_rate": 3.4971819294887445e-06, "loss": 0.0055, "reward": 1.6379375457763672, "reward_std": 0.9130970239639282, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4504374861717224, "step": 2425 }, { "completion_length": 122.25, "epoch": 1.2980203317281969, "grad_norm": 1.373007893562317, "kl": 0.21701711416244507, "learning_rate": 3.4957544373763573e-06, "loss": 0.0087, "reward": 1.8909062147140503, "reward_std": 1.1423192024230957, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46903127431869507, "step": 2426 }, { "completion_length": 121.40625, "epoch": 1.2985553772070626, "grad_norm": 1.0216388702392578, "kl": 0.1791382133960724, "learning_rate": 3.4943265592904692e-06, "loss": 0.0072, "reward": 2.507687568664551, "reward_std": 1.103846549987793, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 2427 }, { "completion_length": 138.4375, "epoch": 1.2990904226859283, "grad_norm": 5.9407572746276855, "kl": 0.19492317736148834, "learning_rate": 3.4928982957845527e-06, "loss": 0.0078, "reward": 2.686781406402588, "reward_std": 0.8754754662513733, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46803125739097595, "step": 2428 }, { "completion_length": 99.28125, "epoch": 1.299625468164794, "grad_norm": 0.5904544591903687, "kl": 0.16158851981163025, "learning_rate": 3.4914696474122302e-06, "loss": 0.0065, "reward": 2.286437511444092, "reward_std": 0.4224071502685547, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4895625114440918, "step": 2429 }, { "completion_length": 132.9375, "epoch": 1.3001605136436596, "grad_norm": 1.2978694438934326, "kl": 0.20414510369300842, "learning_rate": 3.4900406147272724e-06, "loss": 0.0082, "reward": 1.4631249904632568, "reward_std": 0.4599001407623291, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47874999046325684, "step": 2430 }, { "completion_length": 130.90625, "epoch": 1.3006955591225253, "grad_norm": 0.711818516254425, "kl": 0.15310700237751007, "learning_rate": 3.488611198283601e-06, "loss": 0.0061, "reward": 2.489406108856201, "reward_std": 0.8160014748573303, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4269062578678131, "step": 2431 }, { "completion_length": 118.40625, "epoch": 1.301230604601391, "grad_norm": 0.763363242149353, "kl": 0.20620205998420715, "learning_rate": 3.487181398635286e-06, "loss": 0.0082, "reward": 2.12109375, "reward_std": 0.7269863486289978, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49609375, "step": 2432 }, { "completion_length": 128.1875, "epoch": 1.3017656500802568, "grad_norm": 0.8001653552055359, "kl": 0.1847931146621704, "learning_rate": 3.4857512163365425e-06, "loss": 0.0074, "reward": 2.1511874198913574, "reward_std": 0.6727146506309509, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4793125092983246, "step": 2433 }, { "completion_length": 120.53125, "epoch": 1.3023006955591225, "grad_norm": 0.8695458173751831, "kl": 0.16786065697669983, "learning_rate": 3.4843206519417383e-06, "loss": 0.0067, "reward": 2.035749912261963, "reward_std": 0.5574122667312622, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48887500166893005, "step": 2434 }, { "completion_length": 124.03125, "epoch": 1.3028357410379883, "grad_norm": 0.9741595387458801, "kl": 0.2285858392715454, "learning_rate": 3.4828897060053874e-06, "loss": 0.0091, "reward": 1.7544374465942383, "reward_std": 1.000756025314331, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44193750619888306, "step": 2435 }, { "completion_length": 140.65625, "epoch": 1.303370786516854, "grad_norm": 0.7739159464836121, "kl": 0.1665641963481903, "learning_rate": 3.4814583790821515e-06, "loss": 0.0067, "reward": 2.0687499046325684, "reward_std": 1.0493671894073486, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4750000238418579, "step": 2436 }, { "completion_length": 103.8125, "epoch": 1.3039058319957197, "grad_norm": 9.551468849182129, "kl": 0.29867202043533325, "learning_rate": 3.480026671726841e-06, "loss": 0.0119, "reward": 2.374875068664551, "reward_std": 0.7742522954940796, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 2437 }, { "completion_length": 139.03125, "epoch": 1.3044408774745853, "grad_norm": 1.2690339088439941, "kl": 0.28655385971069336, "learning_rate": 3.4785945844944126e-06, "loss": 0.0115, "reward": 1.5517812967300415, "reward_std": 0.7687050104141235, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4736562669277191, "step": 2438 }, { "completion_length": 118.0625, "epoch": 1.304975922953451, "grad_norm": 0.9740299582481384, "kl": 0.21583984792232513, "learning_rate": 3.47716211793997e-06, "loss": 0.0086, "reward": 2.102375030517578, "reward_std": 1.0163482427597046, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46175000071525574, "step": 2439 }, { "completion_length": 143.84375, "epoch": 1.3055109684323167, "grad_norm": 1.2433795928955078, "kl": 0.17293114960193634, "learning_rate": 3.4757292726187664e-06, "loss": 0.0069, "reward": 2.1818125247955322, "reward_std": 0.6758522987365723, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4786875247955322, "step": 2440 }, { "completion_length": 110.6875, "epoch": 1.3060460139111825, "grad_norm": 1.8836408853530884, "kl": 0.2602764964103699, "learning_rate": 3.4742960490861995e-06, "loss": 0.0104, "reward": 2.859375, "reward_std": 0.39774754643440247, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2441 }, { "completion_length": 115.96875, "epoch": 1.3065810593900482, "grad_norm": 1.3562005758285522, "kl": 0.18486328423023224, "learning_rate": 3.472862447897813e-06, "loss": 0.0074, "reward": 2.275749921798706, "reward_std": 0.3517155647277832, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.44762498140335083, "step": 2442 }, { "completion_length": 134.4375, "epoch": 1.3071161048689137, "grad_norm": 2.1011970043182373, "kl": 0.20087434351444244, "learning_rate": 3.4714284696092993e-06, "loss": 0.008, "reward": 1.9373750686645508, "reward_std": 0.7348538637161255, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.468625009059906, "step": 2443 }, { "completion_length": 111.53125, "epoch": 1.3076511503477795, "grad_norm": 1.123619556427002, "kl": 0.21484045684337616, "learning_rate": 3.469994114776496e-06, "loss": 0.0086, "reward": 1.8128437995910645, "reward_std": 0.637798547744751, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4690937399864197, "step": 2444 }, { "completion_length": 110.4375, "epoch": 1.3081861958266452, "grad_norm": 1.4249470233917236, "kl": 0.19131499528884888, "learning_rate": 3.4685593839553857e-06, "loss": 0.0077, "reward": 2.6741249561309814, "reward_std": 0.8453471660614014, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4710000157356262, "step": 2445 }, { "completion_length": 117.34375, "epoch": 1.308721241305511, "grad_norm": 2.1371283531188965, "kl": 0.2252042591571808, "learning_rate": 3.4671242777020985e-06, "loss": 0.009, "reward": 2.7959060668945312, "reward_std": 0.779892086982727, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4834062457084656, "step": 2446 }, { "completion_length": 128.46875, "epoch": 1.3092562867843767, "grad_norm": 1.6942105293273926, "kl": 0.28652456402778625, "learning_rate": 3.465688796572908e-06, "loss": 0.0115, "reward": 1.612375020980835, "reward_std": 0.9514068365097046, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47175002098083496, "step": 2447 }, { "completion_length": 138.15625, "epoch": 1.3097913322632424, "grad_norm": 0.4835883378982544, "kl": 0.19595247507095337, "learning_rate": 3.4642529411242355e-06, "loss": 0.0078, "reward": 2.42578125, "reward_std": 0.8387709856033325, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47265625, "step": 2448 }, { "completion_length": 140.875, "epoch": 1.3103263777421081, "grad_norm": 0.9023667573928833, "kl": 0.16542856395244598, "learning_rate": 3.4628167119126452e-06, "loss": 0.0066, "reward": 2.141125202178955, "reward_std": 0.9987796545028687, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.453624963760376, "step": 2449 }, { "completion_length": 119.90625, "epoch": 1.3108614232209739, "grad_norm": 0.7935168147087097, "kl": 0.1970902979373932, "learning_rate": 3.4613801094948474e-06, "loss": 0.0079, "reward": 2.841031312942505, "reward_std": 1.047412633895874, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4816562533378601, "step": 2450 }, { "completion_length": 157.625, "epoch": 1.3113964686998396, "grad_norm": 3.9867076873779297, "kl": 0.17097046971321106, "learning_rate": 3.4599431344276967e-06, "loss": 0.0068, "reward": 1.5385937690734863, "reward_std": 0.727118968963623, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.41359376907348633, "step": 2451 }, { "completion_length": 116.90625, "epoch": 1.3119315141787051, "grad_norm": 0.5485380291938782, "kl": 0.18853071331977844, "learning_rate": 3.458505787268192e-06, "loss": 0.0075, "reward": 2.4650001525878906, "reward_std": 0.6432831883430481, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4650000035762787, "step": 2452 }, { "completion_length": 147.46875, "epoch": 1.3124665596575709, "grad_norm": 1.2196431159973145, "kl": 0.14839109778404236, "learning_rate": 3.4570680685734757e-06, "loss": 0.0059, "reward": 1.7646563053131104, "reward_std": 0.8848979473114014, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4365312457084656, "step": 2453 }, { "completion_length": 119.0625, "epoch": 1.3130016051364366, "grad_norm": 1.022589921951294, "kl": 0.1795956790447235, "learning_rate": 3.4556299789008374e-06, "loss": 0.0072, "reward": 1.9367187023162842, "reward_std": 0.8618490695953369, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48359376192092896, "step": 2454 }, { "completion_length": 115.46875, "epoch": 1.3135366506153023, "grad_norm": 0.6312917470932007, "kl": 0.16360706090927124, "learning_rate": 3.4541915188077067e-06, "loss": 0.0065, "reward": 2.1612186431884766, "reward_std": 0.7391286492347717, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4737187623977661, "step": 2455 }, { "completion_length": 142.34375, "epoch": 1.314071696094168, "grad_norm": 1.3204824924468994, "kl": 0.310271680355072, "learning_rate": 3.4527526888516583e-06, "loss": 0.0124, "reward": 1.5895313024520874, "reward_std": 0.9643387794494629, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3707812428474426, "step": 2456 }, { "completion_length": 147.875, "epoch": 1.3146067415730336, "grad_norm": 0.590679407119751, "kl": 0.14110061526298523, "learning_rate": 3.4513134895904103e-06, "loss": 0.0056, "reward": 1.6359374523162842, "reward_std": 0.7173011898994446, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44843751192092896, "step": 2457 }, { "completion_length": 120.53125, "epoch": 1.3151417870518993, "grad_norm": 48.722572326660156, "kl": 1.7826021909713745, "learning_rate": 3.4498739215818243e-06, "loss": 0.0713, "reward": 2.83203125, "reward_std": 0.3529890179634094, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 2458 }, { "completion_length": 135.125, "epoch": 1.315676832530765, "grad_norm": 1.4846711158752441, "kl": 0.16205081343650818, "learning_rate": 3.4484339853839044e-06, "loss": 0.0065, "reward": 1.9495313167572021, "reward_std": 0.8340961933135986, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4807812571525574, "step": 2459 }, { "completion_length": 103.96875, "epoch": 1.3162118780096308, "grad_norm": 2.839186906814575, "kl": 0.33969932794570923, "learning_rate": 3.4469936815547973e-06, "loss": 0.0136, "reward": 2.765625, "reward_std": 0.8035287261009216, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.5, "step": 2460 }, { "completion_length": 108.25, "epoch": 1.3167469234884965, "grad_norm": 0.9212089776992798, "kl": 0.24544599652290344, "learning_rate": 3.445553010652792e-06, "loss": 0.0098, "reward": 2.6308436393737793, "reward_std": 0.8341572284698486, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49021875858306885, "step": 2461 }, { "completion_length": 103.4375, "epoch": 1.3172819689673623, "grad_norm": 0.5977236032485962, "kl": 0.16246731579303741, "learning_rate": 3.44411197323632e-06, "loss": 0.0065, "reward": 2.71875, "reward_std": 0.49341505765914917, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 2462 }, { "completion_length": 125.25, "epoch": 1.317817014446228, "grad_norm": 0.8402709364891052, "kl": 0.20466190576553345, "learning_rate": 3.442670569863956e-06, "loss": 0.0082, "reward": 3.081125020980835, "reward_std": 0.7514207363128662, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48737502098083496, "step": 2463 }, { "completion_length": 116.3125, "epoch": 1.3183520599250937, "grad_norm": 0.9183725714683533, "kl": 0.19507324695587158, "learning_rate": 3.4412288010944157e-06, "loss": 0.0078, "reward": 2.7172813415527344, "reward_std": 0.8147634267807007, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4829062521457672, "step": 2464 }, { "completion_length": 146.1875, "epoch": 1.3188871054039593, "grad_norm": 0.5805155038833618, "kl": 0.1415332555770874, "learning_rate": 3.4397866674865553e-06, "loss": 0.0057, "reward": 1.90806245803833, "reward_std": 1.0149692296981812, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42368751764297485, "step": 2465 }, { "completion_length": 109.0625, "epoch": 1.319422150882825, "grad_norm": 1.5635271072387695, "kl": 0.29531484842300415, "learning_rate": 3.4383441695993743e-06, "loss": 0.0118, "reward": 1.921968698501587, "reward_std": 0.2898004651069641, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4844687581062317, "step": 2466 }, { "completion_length": 141.34375, "epoch": 1.3199571963616907, "grad_norm": 120905.0, "kl": 593.162841796875, "learning_rate": 3.436901307992011e-06, "loss": 23.7265, "reward": 1.371687412261963, "reward_std": 0.8089511394500732, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.41856250166893005, "step": 2467 }, { "completion_length": 137.21875, "epoch": 1.3204922418405565, "grad_norm": 0.9011574983596802, "kl": 0.16231033205986023, "learning_rate": 3.435458083223747e-06, "loss": 0.0065, "reward": 1.5314687490463257, "reward_std": 0.6722092628479004, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4377187490463257, "step": 2468 }, { "completion_length": 120.40625, "epoch": 1.3210272873194222, "grad_norm": 0.8988216519355774, "kl": 0.17191457748413086, "learning_rate": 3.4340144958540055e-06, "loss": 0.0069, "reward": 2.40625, "reward_std": 0.6144241094589233, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2469 }, { "completion_length": 145.0625, "epoch": 1.3215623327982877, "grad_norm": 0.6335205435752869, "kl": 0.16599547863006592, "learning_rate": 3.4325705464423465e-06, "loss": 0.0066, "reward": 1.5173749923706055, "reward_std": 0.6777305006980896, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42362499237060547, "step": 2470 }, { "completion_length": 119.71875, "epoch": 1.3220973782771535, "grad_norm": 0.8050058484077454, "kl": 0.19457103312015533, "learning_rate": 3.431126235548473e-06, "loss": 0.0078, "reward": 1.8760311603546143, "reward_std": 1.032515287399292, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4697812795639038, "step": 2471 }, { "completion_length": 136.1875, "epoch": 1.3226324237560192, "grad_norm": 2.0788583755493164, "kl": 0.2110682725906372, "learning_rate": 3.4296815637322283e-06, "loss": 0.0084, "reward": 2.01171875, "reward_std": 0.7451719045639038, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43359375, "step": 2472 }, { "completion_length": 142.84375, "epoch": 1.323167469234885, "grad_norm": 0.8209500908851624, "kl": 0.2272266149520874, "learning_rate": 3.4282365315535937e-06, "loss": 0.0091, "reward": 2.6875, "reward_std": 1.077003002166748, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.484375, "step": 2473 }, { "completion_length": 134.65625, "epoch": 1.3237025147137507, "grad_norm": 0.45783349871635437, "kl": 0.15537434816360474, "learning_rate": 3.4267911395726926e-06, "loss": 0.0062, "reward": 2.078125, "reward_std": 0.7050429582595825, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46875, "step": 2474 }, { "completion_length": 136.25, "epoch": 1.3242375601926164, "grad_norm": 1.4545936584472656, "kl": 0.22056595981121063, "learning_rate": 3.425345388349787e-06, "loss": 0.0088, "reward": 1.5097187757492065, "reward_std": 0.7870824337005615, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44721874594688416, "step": 2475 }, { "completion_length": 123.78125, "epoch": 1.3247726056714821, "grad_norm": 1.7165733575820923, "kl": 0.17625294625759125, "learning_rate": 3.4238992784452747e-06, "loss": 0.0071, "reward": 2.3088126182556152, "reward_std": 0.9677552580833435, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4806874990463257, "step": 2476 }, { "completion_length": 150.71875, "epoch": 1.3253076511503479, "grad_norm": 0.7202460169792175, "kl": 0.1648600846529007, "learning_rate": 3.4224528104196993e-06, "loss": 0.0066, "reward": 1.1458125114440918, "reward_std": 0.5558425188064575, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4270625114440918, "step": 2477 }, { "completion_length": 116.4375, "epoch": 1.3258426966292136, "grad_norm": 1.6296069622039795, "kl": 0.20003022253513336, "learning_rate": 3.4210059848337375e-06, "loss": 0.008, "reward": 2.0925936698913574, "reward_std": 0.8620115518569946, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4832187294960022, "step": 2478 }, { "completion_length": 155.75, "epoch": 1.3263777421080791, "grad_norm": 1.702033281326294, "kl": 0.1528056561946869, "learning_rate": 3.4195588022482066e-06, "loss": 0.0061, "reward": 1.6814374923706055, "reward_std": 0.8835145235061646, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44706249237060547, "step": 2479 }, { "completion_length": 151.46875, "epoch": 1.3269127875869449, "grad_norm": 0.7660238742828369, "kl": 0.18466520309448242, "learning_rate": 3.418111263224064e-06, "loss": 0.0074, "reward": 1.3994375467300415, "reward_std": 0.973989725112915, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41506248712539673, "step": 2480 }, { "completion_length": 126.15625, "epoch": 1.3274478330658106, "grad_norm": 2.1925153732299805, "kl": 0.22186830639839172, "learning_rate": 3.4166633683224014e-06, "loss": 0.0089, "reward": 2.4375, "reward_std": 0.7058678865432739, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46875, "step": 2481 }, { "completion_length": 121.90625, "epoch": 1.3279828785446763, "grad_norm": 4.396119117736816, "kl": 0.2683795392513275, "learning_rate": 3.415215118104452e-06, "loss": 0.0107, "reward": 2.757687568664551, "reward_std": 0.5561203360557556, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 2482 }, { "completion_length": 128.59375, "epoch": 1.328517924023542, "grad_norm": 1.1585112810134888, "kl": 0.21039333939552307, "learning_rate": 3.4137665131315847e-06, "loss": 0.0084, "reward": 1.8959063291549683, "reward_std": 0.7350084781646729, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4584062695503235, "step": 2483 }, { "completion_length": 122.03125, "epoch": 1.3290529695024076, "grad_norm": 0.8074317574501038, "kl": 0.20339643955230713, "learning_rate": 3.4123175539653062e-06, "loss": 0.0081, "reward": 2.1682186126708984, "reward_std": 0.9572754502296448, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4650937616825104, "step": 2484 }, { "completion_length": 117.5625, "epoch": 1.3295880149812733, "grad_norm": 0.5542053580284119, "kl": 0.14863300323486328, "learning_rate": 3.410868241167263e-06, "loss": 0.0059, "reward": 2.296750068664551, "reward_std": 0.8141327500343323, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 2485 }, { "completion_length": 122.09375, "epoch": 1.330123060460139, "grad_norm": 0.6110469102859497, "kl": 0.1902640163898468, "learning_rate": 3.409418575299233e-06, "loss": 0.0076, "reward": 1.59584379196167, "reward_std": 0.5582679510116577, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47084373235702515, "step": 2486 }, { "completion_length": 133.65625, "epoch": 1.3306581059390048, "grad_norm": 1.3268468379974365, "kl": 0.205396369099617, "learning_rate": 3.4079685569231373e-06, "loss": 0.0082, "reward": 2.4386563301086426, "reward_std": 0.8352075815200806, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4699062407016754, "step": 2487 }, { "completion_length": 126.40625, "epoch": 1.3311931514178705, "grad_norm": 1.4296648502349854, "kl": 0.20382162928581238, "learning_rate": 3.40651818660103e-06, "loss": 0.0082, "reward": 1.7582813501358032, "reward_std": 0.8739408850669861, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.43015626072883606, "step": 2488 }, { "completion_length": 108.09375, "epoch": 1.3317281968967363, "grad_norm": 1.0058822631835938, "kl": 0.20383204519748688, "learning_rate": 3.405067464895103e-06, "loss": 0.0082, "reward": 1.4982500076293945, "reward_std": 0.23959019780158997, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49825000762939453, "step": 2489 }, { "completion_length": 139.09375, "epoch": 1.332263242375602, "grad_norm": 2.6515517234802246, "kl": 0.3774847090244293, "learning_rate": 3.4036163923676814e-06, "loss": 0.0151, "reward": 2.037968635559082, "reward_std": 0.9887194037437439, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4598437547683716, "step": 2490 }, { "completion_length": 174.46875, "epoch": 1.3327982878544677, "grad_norm": 2.3808648586273193, "kl": 0.1847062110900879, "learning_rate": 3.4021649695812316e-06, "loss": 0.0074, "reward": 0.9227499961853027, "reward_std": 0.46895793080329895, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.34462499618530273, "step": 2491 }, { "completion_length": 117.84375, "epoch": 1.3333333333333333, "grad_norm": 1.4428505897521973, "kl": 0.22128844261169434, "learning_rate": 3.400713197098352e-06, "loss": 0.0089, "reward": 1.5677499771118164, "reward_std": 0.579901933670044, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4896250069141388, "step": 2492 }, { "completion_length": 118.125, "epoch": 1.333868378812199, "grad_norm": 1.0324240922927856, "kl": 0.20052699744701385, "learning_rate": 3.3992610754817764e-06, "loss": 0.008, "reward": 2.7556874752044678, "reward_std": 0.6999251842498779, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4744374752044678, "step": 2493 }, { "completion_length": 133.25, "epoch": 1.3344034242910647, "grad_norm": 1.220207691192627, "kl": 0.16193103790283203, "learning_rate": 3.3978086052943766e-06, "loss": 0.0065, "reward": 2.19921875, "reward_std": 0.9064971804618835, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46484375, "step": 2494 }, { "completion_length": 111.5625, "epoch": 1.3349384697699305, "grad_norm": 2.1020355224609375, "kl": 0.226359024643898, "learning_rate": 3.396355787099156e-06, "loss": 0.0091, "reward": 2.3246562480926514, "reward_std": 0.8890061974525452, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48090624809265137, "step": 2495 }, { "completion_length": 133.78125, "epoch": 1.3354735152487962, "grad_norm": 1.2387248277664185, "kl": 0.16734248399734497, "learning_rate": 3.394902621459256e-06, "loss": 0.0067, "reward": 1.8670625686645508, "reward_std": 0.5442061424255371, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.492062509059906, "step": 2496 }, { "completion_length": 109.125, "epoch": 1.3360085607276617, "grad_norm": 1.3690600395202637, "kl": 0.21485736966133118, "learning_rate": 3.3934491089379516e-06, "loss": 0.0086, "reward": 1.5772500038146973, "reward_std": 0.41216593980789185, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49912500381469727, "step": 2497 }, { "completion_length": 139.625, "epoch": 1.3365436062065275, "grad_norm": 1.192476511001587, "kl": 0.162611186504364, "learning_rate": 3.3919952500986525e-06, "loss": 0.0065, "reward": 1.7985312938690186, "reward_std": 0.7701399326324463, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4235312342643738, "step": 2498 }, { "completion_length": 119.8125, "epoch": 1.3370786516853932, "grad_norm": 2.775611639022827, "kl": 0.37884312868118286, "learning_rate": 3.3905410455049004e-06, "loss": 0.0152, "reward": 1.7703750133514404, "reward_std": 0.9294081926345825, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48912501335144043, "step": 2499 }, { "completion_length": 115.9375, "epoch": 1.337613697164259, "grad_norm": 0.6280936002731323, "kl": 0.19701386988162994, "learning_rate": 3.389086495720374e-06, "loss": 0.0079, "reward": 1.9961249828338623, "reward_std": 0.8579071760177612, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4648749828338623, "step": 2500 }, { "completion_length": 135.15625, "epoch": 1.3381487426431247, "grad_norm": 0.6861547827720642, "kl": 0.18007132411003113, "learning_rate": 3.387631601308885e-06, "loss": 0.0072, "reward": 1.995843768119812, "reward_std": 1.017669677734375, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4177187383174896, "step": 2501 }, { "completion_length": 153.40625, "epoch": 1.3386837881219904, "grad_norm": 0.6650646924972534, "kl": 0.20842871069908142, "learning_rate": 3.386176362834378e-06, "loss": 0.0083, "reward": 1.764312505722046, "reward_std": 0.36508458852767944, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4049375057220459, "step": 2502 }, { "completion_length": 136.625, "epoch": 1.3392188336008561, "grad_norm": 1.780473232269287, "kl": 0.17258043587207794, "learning_rate": 3.384720780860932e-06, "loss": 0.0069, "reward": 1.7248437404632568, "reward_std": 0.42830994725227356, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.42796874046325684, "step": 2503 }, { "completion_length": 108.65625, "epoch": 1.3397538790797219, "grad_norm": 1.0757930278778076, "kl": 0.22500914335250854, "learning_rate": 3.3832648559527566e-06, "loss": 0.009, "reward": 2.7822813987731934, "reward_std": 0.4764987826347351, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4854062497615814, "step": 2504 }, { "completion_length": 138.34375, "epoch": 1.3402889245585876, "grad_norm": 1.798892617225647, "kl": 0.301491379737854, "learning_rate": 3.381808588674197e-06, "loss": 0.0121, "reward": 2.170875072479248, "reward_std": 1.095942497253418, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45212501287460327, "step": 2505 }, { "completion_length": 131.1875, "epoch": 1.3408239700374531, "grad_norm": 1.6512318849563599, "kl": 0.16475284099578857, "learning_rate": 3.380351979589732e-06, "loss": 0.0066, "reward": 1.69140625, "reward_std": 0.3546988368034363, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 2506 }, { "completion_length": 137.34375, "epoch": 1.3413590155163189, "grad_norm": 1.570631742477417, "kl": 0.23095370829105377, "learning_rate": 3.378895029263969e-06, "loss": 0.0092, "reward": 2.1685311794281006, "reward_std": 0.7222875356674194, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46540626883506775, "step": 2507 }, { "completion_length": 142.21875, "epoch": 1.3418940609951846, "grad_norm": 0.9886799454689026, "kl": 0.19072353839874268, "learning_rate": 3.3774377382616514e-06, "loss": 0.0076, "reward": 2.501093864440918, "reward_std": 1.20607590675354, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4698437452316284, "step": 2508 }, { "completion_length": 108.1875, "epoch": 1.3424291064740503, "grad_norm": 1.4681521654129028, "kl": 0.2975027561187744, "learning_rate": 3.3759801071476517e-06, "loss": 0.0119, "reward": 2.1561875343322754, "reward_std": 0.6591271162033081, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 2509 }, { "completion_length": 99.0625, "epoch": 1.342964151952916, "grad_norm": 2.2006499767303467, "kl": 0.2290429323911667, "learning_rate": 3.374522136486977e-06, "loss": 0.0092, "reward": 3.3125, "reward_std": 0.4082317352294922, "rewards/correctness_reward_func": 1.8125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2510 }, { "completion_length": 140.90625, "epoch": 1.3434991974317816, "grad_norm": 0.8381359577178955, "kl": 0.1972169429063797, "learning_rate": 3.373063826844764e-06, "loss": 0.0079, "reward": 1.8045625686645508, "reward_std": 0.8118549585342407, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.460812509059906, "step": 2511 }, { "completion_length": 143.0, "epoch": 1.3440342429106473, "grad_norm": 0.7079004049301147, "kl": 0.2167307287454605, "learning_rate": 3.371605178786281e-06, "loss": 0.0087, "reward": 1.8520312309265137, "reward_std": 1.001982569694519, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46140623092651367, "step": 2512 }, { "completion_length": 132.28125, "epoch": 1.344569288389513, "grad_norm": 1.0467009544372559, "kl": 0.18269190192222595, "learning_rate": 3.370146192876929e-06, "loss": 0.0073, "reward": 2.12109375, "reward_std": 0.9276348352432251, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49609375, "step": 2513 }, { "completion_length": 121.25, "epoch": 1.3451043338683788, "grad_norm": 4.680449962615967, "kl": 0.6213197112083435, "learning_rate": 3.368686869682238e-06, "loss": 0.0249, "reward": 2.19140625, "reward_std": 1.0900015830993652, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 2514 }, { "completion_length": 119.71875, "epoch": 1.3456393793472445, "grad_norm": 0.8813492655754089, "kl": 0.22274987399578094, "learning_rate": 3.3672272097678706e-06, "loss": 0.0089, "reward": 2.0390625, "reward_std": 0.720655083656311, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4921875, "step": 2515 }, { "completion_length": 130.15625, "epoch": 1.3461744248261103, "grad_norm": 0.9924176931381226, "kl": 0.18909716606140137, "learning_rate": 3.365767213699618e-06, "loss": 0.0076, "reward": 2.05078125, "reward_std": 0.9879754781723022, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 2516 }, { "completion_length": 112.40625, "epoch": 1.346709470304976, "grad_norm": 6.154167175292969, "kl": 0.2115660011768341, "learning_rate": 3.364306882043403e-06, "loss": 0.0085, "reward": 1.9958750009536743, "reward_std": 0.793519139289856, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4646250009536743, "step": 2517 }, { "completion_length": 152.78125, "epoch": 1.3472445157838417, "grad_norm": 0.9099205732345581, "kl": 0.22239172458648682, "learning_rate": 3.362846215365278e-06, "loss": 0.0089, "reward": 1.955125093460083, "reward_std": 0.8835395574569702, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.40825000405311584, "step": 2518 }, { "completion_length": 118.21875, "epoch": 1.3477795612627073, "grad_norm": 0.6640486121177673, "kl": 0.21841560304164886, "learning_rate": 3.3613852142314256e-06, "loss": 0.0087, "reward": 2.3828125, "reward_std": 0.19977308809757233, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 2519 }, { "completion_length": 114.28125, "epoch": 1.348314606741573, "grad_norm": 1.673148512840271, "kl": 0.225685715675354, "learning_rate": 3.359923879208158e-06, "loss": 0.009, "reward": 2.26953125, "reward_std": 0.7523424625396729, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 2520 }, { "completion_length": 124.59375, "epoch": 1.3488496522204387, "grad_norm": 0.9367106556892395, "kl": 0.17908290028572083, "learning_rate": 3.3584622108619158e-06, "loss": 0.0072, "reward": 2.88671875, "reward_std": 0.8667501211166382, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48046875, "step": 2521 }, { "completion_length": 121.3125, "epoch": 1.3493846976993045, "grad_norm": 0.7804650068283081, "kl": 0.18282006680965424, "learning_rate": 3.3570002097592715e-06, "loss": 0.0073, "reward": 1.8828125, "reward_std": 0.7762312293052673, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4765625, "step": 2522 }, { "completion_length": 116.65625, "epoch": 1.3499197431781702, "grad_norm": 2.409280776977539, "kl": 0.24603351950645447, "learning_rate": 3.355537876466923e-06, "loss": 0.0098, "reward": 2.52553129196167, "reward_std": 0.6906237602233887, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49428123235702515, "step": 2523 }, { "completion_length": 164.09375, "epoch": 1.3504547886570357, "grad_norm": 0.5958527326583862, "kl": 0.14910826086997986, "learning_rate": 3.354075211551698e-06, "loss": 0.006, "reward": 1.3055000305175781, "reward_std": 1.035674810409546, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.38362500071525574, "step": 2524 }, { "completion_length": 109.21875, "epoch": 1.3509898341359015, "grad_norm": 0.6761331558227539, "kl": 0.22220848500728607, "learning_rate": 3.3526122155805562e-06, "loss": 0.0089, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 2525 }, { "completion_length": 118.125, "epoch": 1.3515248796147672, "grad_norm": 1.1581785678863525, "kl": 0.2041839361190796, "learning_rate": 3.3511488891205813e-06, "loss": 0.0082, "reward": 2.265625, "reward_std": 0.5425772070884705, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.484375, "step": 2526 }, { "completion_length": 118.6875, "epoch": 1.352059925093633, "grad_norm": 0.7156517505645752, "kl": 0.31597667932510376, "learning_rate": 3.3496852327389864e-06, "loss": 0.0126, "reward": 2.166374921798706, "reward_std": 0.35974615812301636, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47887498140335083, "step": 2527 }, { "completion_length": 122.34375, "epoch": 1.3525949705724987, "grad_norm": 2.316873073577881, "kl": 0.36668649315834045, "learning_rate": 3.348221247003113e-06, "loss": 0.0147, "reward": 1.909406304359436, "reward_std": 1.0360515117645264, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45628124475479126, "step": 2528 }, { "completion_length": 166.625, "epoch": 1.3531300160513644, "grad_norm": 1.7052477598190308, "kl": 0.1387244462966919, "learning_rate": 3.34675693248043e-06, "loss": 0.0055, "reward": 1.3203125, "reward_std": 0.5273936986923218, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.3671875, "step": 2529 }, { "completion_length": 109.9375, "epoch": 1.3536650615302301, "grad_norm": 1.5441467761993408, "kl": 0.2747577726840973, "learning_rate": 3.345292289738534e-06, "loss": 0.011, "reward": 1.9896562099456787, "reward_std": 0.46717432141304016, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4740312397480011, "step": 2530 }, { "completion_length": 143.28125, "epoch": 1.3542001070090959, "grad_norm": 0.8102260828018188, "kl": 0.17106392979621887, "learning_rate": 3.3438273193451487e-06, "loss": 0.0068, "reward": 1.8439688682556152, "reward_std": 0.7750734686851501, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4689687490463257, "step": 2531 }, { "completion_length": 128.8125, "epoch": 1.3547351524879616, "grad_norm": 9.880537033081055, "kl": 0.5552617907524109, "learning_rate": 3.3423620218681246e-06, "loss": 0.0222, "reward": 2.975062370300293, "reward_std": 0.8968117237091064, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4750624895095825, "step": 2532 }, { "completion_length": 98.78125, "epoch": 1.3552701979668271, "grad_norm": 4439005184.0, "kl": 4656536.0, "learning_rate": 3.340896397875438e-06, "loss": 186261.4531, "reward": 2.736281156539917, "reward_std": 0.3576902747154236, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48628124594688416, "step": 2533 }, { "completion_length": 157.75, "epoch": 1.3558052434456929, "grad_norm": 0.8142717480659485, "kl": 0.15898475050926208, "learning_rate": 3.339430447935195e-06, "loss": 0.0064, "reward": 1.2937812805175781, "reward_std": 0.5558464527130127, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45003125071525574, "step": 2534 }, { "completion_length": 125.625, "epoch": 1.3563402889245586, "grad_norm": 1.3240303993225098, "kl": 0.2004614621400833, "learning_rate": 3.337964172615624e-06, "loss": 0.008, "reward": 2.6087188720703125, "reward_std": 1.1483099460601807, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46809375286102295, "step": 2535 }, { "completion_length": 130.625, "epoch": 1.3568753344034243, "grad_norm": 0.9371070861816406, "kl": 0.18274953961372375, "learning_rate": 3.3364975724850825e-06, "loss": 0.0073, "reward": 2.1236250400543213, "reward_std": 0.9068877696990967, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4204999804496765, "step": 2536 }, { "completion_length": 124.375, "epoch": 1.35741037988229, "grad_norm": 1.0951597690582275, "kl": 0.19268754124641418, "learning_rate": 3.335030648112052e-06, "loss": 0.0077, "reward": 2.130218744277954, "reward_std": 1.0724289417266846, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4270937442779541, "step": 2537 }, { "completion_length": 127.34375, "epoch": 1.3579454253611556, "grad_norm": 1.3217957019805908, "kl": 0.18207097053527832, "learning_rate": 3.3335634000651414e-06, "loss": 0.0073, "reward": 2.0361876487731934, "reward_std": 0.6979489922523499, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4580624997615814, "step": 2538 }, { "completion_length": 125.6875, "epoch": 1.3584804708400213, "grad_norm": 0.8652077317237854, "kl": 0.19043073058128357, "learning_rate": 3.3320958289130832e-06, "loss": 0.0076, "reward": 2.4359688758850098, "reward_std": 0.9963562488555908, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4672187566757202, "step": 2539 }, { "completion_length": 143.65625, "epoch": 1.359015516318887, "grad_norm": 1.2619214057922363, "kl": 0.19318687915802002, "learning_rate": 3.3306279352247375e-06, "loss": 0.0077, "reward": 1.7566249370574951, "reward_std": 0.8885384798049927, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4284999966621399, "step": 2540 }, { "completion_length": 126.09375, "epoch": 1.3595505617977528, "grad_norm": 1.0752469301223755, "kl": 0.27293622493743896, "learning_rate": 3.3291597195690854e-06, "loss": 0.0109, "reward": 2.921875, "reward_std": 0.90977942943573, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 2541 }, { "completion_length": 116.75, "epoch": 1.3600856072766185, "grad_norm": 0.8592517375946045, "kl": 0.20585918426513672, "learning_rate": 3.3276911825152386e-06, "loss": 0.0082, "reward": 2.625, "reward_std": 0.7007359266281128, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 2542 }, { "completion_length": 139.9375, "epoch": 1.3606206527554843, "grad_norm": 0.8547224998474121, "kl": 0.1648978292942047, "learning_rate": 3.3262223246324284e-06, "loss": 0.0066, "reward": 2.1191248893737793, "reward_std": 0.8326389193534851, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46287500858306885, "step": 2543 }, { "completion_length": 113.65625, "epoch": 1.36115569823435, "grad_norm": 1.2978132963180542, "kl": 0.17458921670913696, "learning_rate": 3.324753146490012e-06, "loss": 0.007, "reward": 2.7049062252044678, "reward_std": 1.025928258895874, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48615625500679016, "step": 2544 }, { "completion_length": 106.625, "epoch": 1.3616907437132157, "grad_norm": 6.933866024017334, "kl": 0.7680225968360901, "learning_rate": 3.3232836486574725e-06, "loss": 0.0307, "reward": 1.3858437538146973, "reward_std": 0.3956807553768158, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47959375381469727, "step": 2545 }, { "completion_length": 111.125, "epoch": 1.3622257891920813, "grad_norm": 1.5875426530838013, "kl": 0.14838874340057373, "learning_rate": 3.321813831704414e-06, "loss": 0.0059, "reward": 2.5152812004089355, "reward_std": 0.8502596020698547, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4840312600135803, "step": 2546 }, { "completion_length": 98.375, "epoch": 1.362760834670947, "grad_norm": 2.574869394302368, "kl": 0.2586454749107361, "learning_rate": 3.320343696200566e-06, "loss": 0.0103, "reward": 1.9232187271118164, "reward_std": 0.6933853626251221, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4544687569141388, "step": 2547 }, { "completion_length": 135.125, "epoch": 1.3632958801498127, "grad_norm": 28.14851188659668, "kl": 0.5539758205413818, "learning_rate": 3.318873242715782e-06, "loss": 0.0222, "reward": 1.7740000486373901, "reward_std": 0.9675531983375549, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.46149998903274536, "step": 2548 }, { "completion_length": 101.09375, "epoch": 1.3638309256286785, "grad_norm": 1.085443139076233, "kl": 0.21699589490890503, "learning_rate": 3.317402471820037e-06, "loss": 0.0087, "reward": 2.8695311546325684, "reward_std": 0.7161108255386353, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4945312440395355, "step": 2549 }, { "completion_length": 134.875, "epoch": 1.3643659711075442, "grad_norm": 1.5631564855575562, "kl": 0.17450112104415894, "learning_rate": 3.3159313840834304e-06, "loss": 0.007, "reward": 1.8515625, "reward_std": 1.0829808712005615, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4609375, "step": 2550 }, { "completion_length": 131.9375, "epoch": 1.3649010165864097, "grad_norm": 0.5030618906021118, "kl": 0.16626793146133423, "learning_rate": 3.3144599800761846e-06, "loss": 0.0067, "reward": 2.3349688053131104, "reward_std": 0.7109841108322144, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4599687457084656, "step": 2551 }, { "completion_length": 110.53125, "epoch": 1.3654360620652755, "grad_norm": 1.7191380262374878, "kl": 0.18751701712608337, "learning_rate": 3.3129882603686424e-06, "loss": 0.0075, "reward": 2.3515625, "reward_std": 0.567357063293457, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 2552 }, { "completion_length": 136.40625, "epoch": 1.3659711075441412, "grad_norm": 1.187180519104004, "kl": 0.17458827793598175, "learning_rate": 3.3115162255312716e-06, "loss": 0.007, "reward": 1.4683749675750732, "reward_std": 0.8074296712875366, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.42149999737739563, "step": 2553 }, { "completion_length": 122.09375, "epoch": 1.366506153023007, "grad_norm": 1.493418574333191, "kl": 0.24402856826782227, "learning_rate": 3.3100438761346613e-06, "loss": 0.0098, "reward": 2.16015625, "reward_std": 0.39260220527648926, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 2554 }, { "completion_length": 123.6875, "epoch": 1.3670411985018727, "grad_norm": 0.8862495422363281, "kl": 0.15807107090950012, "learning_rate": 3.3085712127495227e-06, "loss": 0.0063, "reward": 2.3040313720703125, "reward_std": 0.7360543012619019, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49153125286102295, "step": 2555 }, { "completion_length": 149.0, "epoch": 1.3675762439807384, "grad_norm": 1.4823869466781616, "kl": 0.18573500216007233, "learning_rate": 3.307098235946687e-06, "loss": 0.0074, "reward": 2.139937400817871, "reward_std": 0.5849091410636902, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43681252002716064, "step": 2556 }, { "completion_length": 129.25, "epoch": 1.3681112894596041, "grad_norm": 2.3377208709716797, "kl": 0.2578750252723694, "learning_rate": 3.3056249462971085e-06, "loss": 0.0103, "reward": 1.9469687938690186, "reward_std": 0.8243329524993896, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4938437342643738, "step": 2557 }, { "completion_length": 134.1875, "epoch": 1.3686463349384699, "grad_norm": 0.8474524021148682, "kl": 0.1324760466814041, "learning_rate": 3.3041513443718637e-06, "loss": 0.0053, "reward": 1.7546563148498535, "reward_std": 0.6784718632698059, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44215625524520874, "step": 2558 }, { "completion_length": 137.75, "epoch": 1.3691813804173354, "grad_norm": 1.2028073072433472, "kl": 0.1776793748140335, "learning_rate": 3.3026774307421476e-06, "loss": 0.0071, "reward": 2.1015625, "reward_std": 0.9347288608551025, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4921875, "step": 2559 }, { "completion_length": 150.0, "epoch": 1.3697164258962011, "grad_norm": 14.177599906921387, "kl": 0.48528414964675903, "learning_rate": 3.301203205979279e-06, "loss": 0.0194, "reward": 1.1317187547683716, "reward_std": 0.6160382032394409, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4129687547683716, "step": 2560 }, { "completion_length": 110.46875, "epoch": 1.3702514713750669, "grad_norm": 1.2733639478683472, "kl": 0.24654889106750488, "learning_rate": 3.2997286706546933e-06, "loss": 0.0099, "reward": 2.636406183242798, "reward_std": 0.953360915184021, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4801562428474426, "step": 2561 }, { "completion_length": 126.46875, "epoch": 1.3707865168539326, "grad_norm": 0.9232872724533081, "kl": 0.16444310545921326, "learning_rate": 3.29825382533995e-06, "loss": 0.0066, "reward": 2.4579687118530273, "reward_std": 1.0065324306488037, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47359374165534973, "step": 2562 }, { "completion_length": 130.21875, "epoch": 1.3713215623327983, "grad_norm": 0.7978324890136719, "kl": 0.2625385522842407, "learning_rate": 3.2967786706067283e-06, "loss": 0.0105, "reward": 1.7975624799728394, "reward_std": 0.6121734380722046, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43818750977516174, "step": 2563 }, { "completion_length": 119.875, "epoch": 1.3718566078116639, "grad_norm": 1.3427897691726685, "kl": 0.20950953662395477, "learning_rate": 3.295303207026825e-06, "loss": 0.0084, "reward": 2.2578125, "reward_std": 0.8958110213279724, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 2564 }, { "completion_length": 143.6875, "epoch": 1.3723916532905296, "grad_norm": 0.7586553692817688, "kl": 0.17725013196468353, "learning_rate": 3.293827435172159e-06, "loss": 0.0071, "reward": 2.1404685974121094, "reward_std": 0.8131930828094482, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4217187464237213, "step": 2565 }, { "completion_length": 129.71875, "epoch": 1.3729266987693953, "grad_norm": 2.135573148727417, "kl": 0.18201330304145813, "learning_rate": 3.2923513556147666e-06, "loss": 0.0073, "reward": 1.5213124752044678, "reward_std": 0.8188133835792542, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45881250500679016, "step": 2566 }, { "completion_length": 159.5, "epoch": 1.373461744248261, "grad_norm": 1.3830008506774902, "kl": 0.21285665035247803, "learning_rate": 3.2908749689268056e-06, "loss": 0.0085, "reward": 1.3289687633514404, "reward_std": 0.7496634125709534, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43834376335144043, "step": 2567 }, { "completion_length": 153.1875, "epoch": 1.3739967897271268, "grad_norm": 1.747954249382019, "kl": 0.15350008010864258, "learning_rate": 3.2893982756805516e-06, "loss": 0.0061, "reward": 1.97265625, "reward_std": 0.973882794380188, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44140625, "step": 2568 }, { "completion_length": 134.375, "epoch": 1.3745318352059925, "grad_norm": 14.781410217285156, "kl": 0.9914472103118896, "learning_rate": 3.2879212764483993e-06, "loss": 0.0397, "reward": 2.3236563205718994, "reward_std": 1.2957594394683838, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46428126096725464, "step": 2569 }, { "completion_length": 125.28125, "epoch": 1.3750668806848583, "grad_norm": 1473.8648681640625, "kl": 56.514556884765625, "learning_rate": 3.2864439718028605e-06, "loss": 2.2606, "reward": 1.3630311489105225, "reward_std": 0.48414722084999084, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.472406268119812, "step": 2570 }, { "completion_length": 125.21875, "epoch": 1.375601926163724, "grad_norm": 0.3025059998035431, "kl": 0.14388953149318695, "learning_rate": 3.2849663623165685e-06, "loss": 0.0058, "reward": 2.10546875, "reward_std": 0.410896360874176, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48046875, "step": 2571 }, { "completion_length": 121.4375, "epoch": 1.3761369716425897, "grad_norm": 1.0914082527160645, "kl": 0.22819462418556213, "learning_rate": 3.283488448562273e-06, "loss": 0.0091, "reward": 2.125, "reward_std": 0.883864164352417, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.484375, "step": 2572 }, { "completion_length": 116.0, "epoch": 1.3766720171214553, "grad_norm": 1.0443090200424194, "kl": 0.18210430443286896, "learning_rate": 3.28201023111284e-06, "loss": 0.0073, "reward": 2.234375, "reward_std": 0.5640586614608765, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2573 }, { "completion_length": 138.5625, "epoch": 1.377207062600321, "grad_norm": 1.2492769956588745, "kl": 0.3270419239997864, "learning_rate": 3.2805317105412572e-06, "loss": 0.0131, "reward": 2.3138437271118164, "reward_std": 0.8881034851074219, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4700937569141388, "step": 2574 }, { "completion_length": 115.75, "epoch": 1.3777421080791867, "grad_norm": 0.8793543577194214, "kl": 0.2012200504541397, "learning_rate": 3.2790528874206255e-06, "loss": 0.008, "reward": 1.94140625, "reward_std": 1.0392080545425415, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 2575 }, { "completion_length": 98.21875, "epoch": 1.3782771535580525, "grad_norm": 0.8468867540359497, "kl": 0.30137884616851807, "learning_rate": 3.2775737623241664e-06, "loss": 0.0121, "reward": 2.722343921661377, "reward_std": 0.45954060554504395, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4723437428474426, "step": 2576 }, { "completion_length": 121.21875, "epoch": 1.3788121990369182, "grad_norm": 1.662316918373108, "kl": 0.19167976081371307, "learning_rate": 3.276094335825217e-06, "loss": 0.0077, "reward": 2.171999931335449, "reward_std": 0.702976405620575, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.468874990940094, "step": 2577 }, { "completion_length": 116.3125, "epoch": 1.3793472445157837, "grad_norm": 0.9129929542541504, "kl": 0.20013877749443054, "learning_rate": 3.2746146084972304e-06, "loss": 0.008, "reward": 2.71875, "reward_std": 0.956214964389801, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2578 }, { "completion_length": 127.46875, "epoch": 1.3798822899946495, "grad_norm": 2.9534566402435303, "kl": 0.2833893895149231, "learning_rate": 3.2731345809137784e-06, "loss": 0.0113, "reward": 1.371749997138977, "reward_std": 0.39725765585899353, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48112499713897705, "step": 2579 }, { "completion_length": 118.46875, "epoch": 1.3804173354735152, "grad_norm": 1.4918959140777588, "kl": 0.19775131344795227, "learning_rate": 3.271654253648547e-06, "loss": 0.0079, "reward": 1.76953125, "reward_std": 0.4618768095970154, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 2580 }, { "completion_length": 115.9375, "epoch": 1.380952380952381, "grad_norm": 1.5315607786178589, "kl": 0.3014313578605652, "learning_rate": 3.2701736272753405e-06, "loss": 0.0121, "reward": 2.199093818664551, "reward_std": 0.7840831279754639, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.495968759059906, "step": 2581 }, { "completion_length": 117.71875, "epoch": 1.3814874264312467, "grad_norm": 2.892808675765991, "kl": 0.5030605792999268, "learning_rate": 3.268692702368077e-06, "loss": 0.0201, "reward": 2.765625, "reward_std": 1.016887903213501, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 2582 }, { "completion_length": 114.53125, "epoch": 1.3820224719101124, "grad_norm": 1.925706148147583, "kl": 0.38614553213119507, "learning_rate": 3.2672114795007926e-06, "loss": 0.0154, "reward": 1.81640625, "reward_std": 0.40151113271713257, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 2583 }, { "completion_length": 118.96875, "epoch": 1.3825575173889781, "grad_norm": 0.7265208959579468, "kl": 0.20438796281814575, "learning_rate": 3.2657299592476367e-06, "loss": 0.0082, "reward": 2.382687568664551, "reward_std": 1.0700429677963257, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 2584 }, { "completion_length": 128.84375, "epoch": 1.3830925628678439, "grad_norm": 5.6319580078125, "kl": 0.24333544075489044, "learning_rate": 3.264248142182875e-06, "loss": 0.0097, "reward": 1.9893124103546143, "reward_std": 0.7121537327766418, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4580624997615814, "step": 2585 }, { "completion_length": 126.59375, "epoch": 1.3836276083467094, "grad_norm": 0.7773366570472717, "kl": 0.2173423022031784, "learning_rate": 3.2627660288808888e-06, "loss": 0.0087, "reward": 2.1171875, "reward_std": 1.0145461559295654, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 2586 }, { "completion_length": 138.28125, "epoch": 1.3841626538255751, "grad_norm": 0.7294819951057434, "kl": 0.2292463481426239, "learning_rate": 3.2612836199161724e-06, "loss": 0.0092, "reward": 2.1103124618530273, "reward_std": 1.2295646667480469, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4696875214576721, "step": 2587 }, { "completion_length": 127.0, "epoch": 1.3846976993044409, "grad_norm": 9.309925079345703, "kl": 0.5235152244567871, "learning_rate": 3.2598009158633376e-06, "loss": 0.0209, "reward": 1.8984375, "reward_std": 0.9376453161239624, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 2588 }, { "completion_length": 131.75, "epoch": 1.3852327447833066, "grad_norm": 1.6161547899246216, "kl": 0.22853821516036987, "learning_rate": 3.258317917297108e-06, "loss": 0.0091, "reward": 2.1015625, "reward_std": 0.800905704498291, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4765625, "step": 2589 }, { "completion_length": 168.21875, "epoch": 1.3857677902621723, "grad_norm": 182.95542907714844, "kl": 48.441349029541016, "learning_rate": 3.256834624792321e-06, "loss": 1.9377, "reward": 1.625, "reward_std": 0.7924577593803406, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.421875, "step": 2590 }, { "completion_length": 110.125, "epoch": 1.3863028357410379, "grad_norm": 0.7299669981002808, "kl": 0.19748914241790771, "learning_rate": 3.255351038923932e-06, "loss": 0.0079, "reward": 2.67578125, "reward_std": 0.4211835265159607, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 2591 }, { "completion_length": 117.03125, "epoch": 1.3868378812199036, "grad_norm": 1.0864671468734741, "kl": 0.2079845368862152, "learning_rate": 3.2538671602670057e-06, "loss": 0.0083, "reward": 2.467062473297119, "reward_std": 0.8106839656829834, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48268750309944153, "step": 2592 }, { "completion_length": 115.53125, "epoch": 1.3873729266987693, "grad_norm": 0.7424715161323547, "kl": 0.22601589560508728, "learning_rate": 3.2523829893967214e-06, "loss": 0.009, "reward": 2.50390625, "reward_std": 0.7879771590232849, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 2593 }, { "completion_length": 114.625, "epoch": 1.387907972177635, "grad_norm": 2.3165786266326904, "kl": 0.24146682024002075, "learning_rate": 3.250898526888373e-06, "loss": 0.0097, "reward": 2.09375, "reward_std": 0.7529441118240356, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.5, "step": 2594 }, { "completion_length": 122.96875, "epoch": 1.3884430176565008, "grad_norm": 0.8669283390045166, "kl": 0.19978642463684082, "learning_rate": 3.2494137733173664e-06, "loss": 0.008, "reward": 1.6875, "reward_std": 0.5904620885848999, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.5, "step": 2595 }, { "completion_length": 125.59375, "epoch": 1.3889780631353665, "grad_norm": 0.8183545470237732, "kl": 0.21077142655849457, "learning_rate": 3.2479287292592202e-06, "loss": 0.0084, "reward": 1.440406322479248, "reward_std": 0.4137195348739624, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48728126287460327, "step": 2596 }, { "completion_length": 142.375, "epoch": 1.3895131086142323, "grad_norm": 2.8878679275512695, "kl": 0.21246981620788574, "learning_rate": 3.246443395289567e-06, "loss": 0.0085, "reward": 1.8715312480926514, "reward_std": 0.48712268471717834, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43403124809265137, "step": 2597 }, { "completion_length": 143.375, "epoch": 1.390048154093098, "grad_norm": 1.0431585311889648, "kl": 0.214985191822052, "learning_rate": 3.2449577719841495e-06, "loss": 0.0086, "reward": 1.4660313129425049, "reward_std": 0.500714123249054, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4816562533378601, "step": 2598 }, { "completion_length": 119.59375, "epoch": 1.3905831995719637, "grad_norm": 4.6180806159973145, "kl": 0.49125272035598755, "learning_rate": 3.243471859918824e-06, "loss": 0.0197, "reward": 2.12109375, "reward_std": 0.796364426612854, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49609375, "step": 2599 }, { "completion_length": 148.875, "epoch": 1.3911182450508293, "grad_norm": 4.282297611236572, "kl": 0.2808091640472412, "learning_rate": 3.24198565966956e-06, "loss": 0.0112, "reward": 1.996187448501587, "reward_std": 1.0701231956481934, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4024375081062317, "step": 2600 }, { "completion_length": 133.84375, "epoch": 1.391653290529695, "grad_norm": 78.37530517578125, "kl": 29.17671012878418, "learning_rate": 3.2404991718124344e-06, "loss": 1.1671, "reward": 1.7108750343322754, "reward_std": 0.6405707001686096, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.476500004529953, "step": 2601 }, { "completion_length": 137.15625, "epoch": 1.3921883360085607, "grad_norm": 1.1142280101776123, "kl": 0.20040732622146606, "learning_rate": 3.239012396923641e-06, "loss": 0.008, "reward": 1.83984375, "reward_std": 0.822899580001831, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48046875, "step": 2602 }, { "completion_length": 131.5625, "epoch": 1.3927233814874265, "grad_norm": 89.60806274414062, "kl": 0.39084455370903015, "learning_rate": 3.2375253355794807e-06, "loss": 0.0156, "reward": 2.649531364440918, "reward_std": 0.6222104430198669, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4307812452316284, "step": 2603 }, { "completion_length": 136.90625, "epoch": 1.3932584269662922, "grad_norm": 0.8606239557266235, "kl": 0.1921016275882721, "learning_rate": 3.2360379883563664e-06, "loss": 0.0077, "reward": 2.0678749084472656, "reward_std": 0.7372783422470093, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4428749978542328, "step": 2604 }, { "completion_length": 135.59375, "epoch": 1.3937934724451577, "grad_norm": 0.7249608635902405, "kl": 0.19613365828990936, "learning_rate": 3.234550355830825e-06, "loss": 0.0078, "reward": 2.0169687271118164, "reward_std": 0.38070374727249146, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4857187569141388, "step": 2605 }, { "completion_length": 129.1875, "epoch": 1.3943285179240235, "grad_norm": 0.633571207523346, "kl": 0.1552739143371582, "learning_rate": 3.2330624385794895e-06, "loss": 0.0062, "reward": 2.2232813835144043, "reward_std": 0.6742395162582397, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48890626430511475, "step": 2606 }, { "completion_length": 123.15625, "epoch": 1.3948635634028892, "grad_norm": 7.8661627769470215, "kl": 0.5016246438026428, "learning_rate": 3.231574237179105e-06, "loss": 0.0201, "reward": 2.0038437843322754, "reward_std": 0.702610969543457, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.472593754529953, "step": 2607 }, { "completion_length": 120.09375, "epoch": 1.395398608881755, "grad_norm": 1.7761107683181763, "kl": 0.17844349145889282, "learning_rate": 3.230085752206527e-06, "loss": 0.0071, "reward": 2.409343719482422, "reward_std": 0.5361908078193665, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48746874928474426, "step": 2608 }, { "completion_length": 119.15625, "epoch": 1.3959336543606207, "grad_norm": 1.7073944807052612, "kl": 0.16348518431186676, "learning_rate": 3.228596984238722e-06, "loss": 0.0065, "reward": 1.5, "reward_std": 0.3858202397823334, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 2609 }, { "completion_length": 136.5, "epoch": 1.3964686998394864, "grad_norm": 445.29962158203125, "kl": 1.387444257736206, "learning_rate": 3.227107933852763e-06, "loss": 0.0555, "reward": 1.953125, "reward_std": 0.9820161461830139, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46875, "step": 2610 }, { "completion_length": 126.03125, "epoch": 1.3970037453183521, "grad_norm": 1.6934407949447632, "kl": 0.20327700674533844, "learning_rate": 3.2256186016258363e-06, "loss": 0.0081, "reward": 1.9647188186645508, "reward_std": 0.6655980348587036, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.480343759059906, "step": 2611 }, { "completion_length": 130.28125, "epoch": 1.3975387907972179, "grad_norm": 7469.5712890625, "kl": 90.05014038085938, "learning_rate": 3.224128988135235e-06, "loss": 3.602, "reward": 2.33984375, "reward_std": 0.8830574154853821, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48046875, "step": 2612 }, { "completion_length": 101.625, "epoch": 1.3980738362760834, "grad_norm": 0.6868873238563538, "kl": 0.15779510140419006, "learning_rate": 3.2226390939583607e-06, "loss": 0.0063, "reward": 2.765625, "reward_std": 0.3874039351940155, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 2613 }, { "completion_length": 114.625, "epoch": 1.3986088817549491, "grad_norm": 0.4642166197299957, "kl": 0.20802289247512817, "learning_rate": 3.2211489196727263e-06, "loss": 0.0083, "reward": 2.1123125553131104, "reward_std": 0.5588467121124268, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4716874957084656, "step": 2614 }, { "completion_length": 129.34375, "epoch": 1.3991439272338149, "grad_norm": 0.8363798260688782, "kl": 0.14704503118991852, "learning_rate": 3.2196584658559525e-06, "loss": 0.0059, "reward": 2.702312469482422, "reward_std": 0.9853585362434387, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49918749928474426, "step": 2615 }, { "completion_length": 122.625, "epoch": 1.3996789727126806, "grad_norm": 0.7245301604270935, "kl": 0.2208673655986786, "learning_rate": 3.218167733085766e-06, "loss": 0.0088, "reward": 2.515625, "reward_std": 0.43611252307891846, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 2616 }, { "completion_length": 125.6875, "epoch": 1.4002140181915463, "grad_norm": 1.3974380493164062, "kl": 0.20075367391109467, "learning_rate": 3.2166767219400043e-06, "loss": 0.008, "reward": 2.1372811794281006, "reward_std": 0.8872458338737488, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48103123903274536, "step": 2617 }, { "completion_length": 142.25, "epoch": 1.4007490636704119, "grad_norm": 0.7594618797302246, "kl": 0.1934889554977417, "learning_rate": 3.2151854329966116e-06, "loss": 0.0077, "reward": 1.3615624904632568, "reward_std": 0.5963910818099976, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4709375202655792, "step": 2618 }, { "completion_length": 162.25, "epoch": 1.4012841091492776, "grad_norm": 1.1122941970825195, "kl": 0.12735530734062195, "learning_rate": 3.2136938668336403e-06, "loss": 0.0051, "reward": 1.415656328201294, "reward_std": 0.785553514957428, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.38440626859664917, "step": 2619 }, { "completion_length": 124.34375, "epoch": 1.4018191546281433, "grad_norm": 0.9072118997573853, "kl": 0.15572363138198853, "learning_rate": 3.2122020240292507e-06, "loss": 0.0062, "reward": 2.6640625, "reward_std": 0.6017240285873413, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4765625, "step": 2620 }, { "completion_length": 130.25, "epoch": 1.402354200107009, "grad_norm": 0.6028137803077698, "kl": 0.14904139935970306, "learning_rate": 3.2107099051617073e-06, "loss": 0.006, "reward": 2.859375, "reward_std": 0.26252126693725586, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2621 }, { "completion_length": 110.5, "epoch": 1.4028892455858748, "grad_norm": 2.624769449234009, "kl": 0.23745806515216827, "learning_rate": 3.209217510809387e-06, "loss": 0.0095, "reward": 2.42578125, "reward_std": 0.520991325378418, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 2622 }, { "completion_length": 137.65625, "epoch": 1.4034242910647405, "grad_norm": 2.2338593006134033, "kl": 0.16068026423454285, "learning_rate": 3.207724841550769e-06, "loss": 0.0064, "reward": 2.0703125, "reward_std": 0.6403318643569946, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 2623 }, { "completion_length": 131.53125, "epoch": 1.4039593365436063, "grad_norm": 1.5382739305496216, "kl": 0.2723158001899719, "learning_rate": 3.2062318979644396e-06, "loss": 0.0109, "reward": 2.7964999675750732, "reward_std": 1.014034628868103, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49962499737739563, "step": 2624 }, { "completion_length": 133.90625, "epoch": 1.404494382022472, "grad_norm": 1.3667508363723755, "kl": 0.21536868810653687, "learning_rate": 3.204738680629094e-06, "loss": 0.0086, "reward": 2.4358437061309814, "reward_std": 1.2547005414962769, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4827187657356262, "step": 2625 }, { "completion_length": 141.59375, "epoch": 1.4050294275013377, "grad_norm": 2.3966376781463623, "kl": 0.2597073018550873, "learning_rate": 3.2032451901235316e-06, "loss": 0.0104, "reward": 1.426687479019165, "reward_std": 0.3827519416809082, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.3954375088214874, "step": 2626 }, { "completion_length": 141.53125, "epoch": 1.4055644729802033, "grad_norm": 0.9326630234718323, "kl": 0.15728577971458435, "learning_rate": 3.201751427026657e-06, "loss": 0.0063, "reward": 1.90234375, "reward_std": 0.8237638473510742, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46484375, "step": 2627 }, { "completion_length": 139.5625, "epoch": 1.406099518459069, "grad_norm": 1.6916404962539673, "kl": 0.2579905390739441, "learning_rate": 3.2002573919174817e-06, "loss": 0.0103, "reward": 1.570156216621399, "reward_std": 0.979966402053833, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4295312464237213, "step": 2628 }, { "completion_length": 108.53125, "epoch": 1.4066345639379347, "grad_norm": 1.1810566186904907, "kl": 0.20142434537410736, "learning_rate": 3.198763085375123e-06, "loss": 0.0081, "reward": 2.6174373626708984, "reward_std": 0.7740830183029175, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4924375116825104, "step": 2629 }, { "completion_length": 142.15625, "epoch": 1.4071696094168005, "grad_norm": 8.583405494689941, "kl": 0.828806459903717, "learning_rate": 3.1972685079788017e-06, "loss": 0.0332, "reward": 2.4296875, "reward_std": 1.2263081073760986, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 2630 }, { "completion_length": 115.75, "epoch": 1.4077046548956662, "grad_norm": 1.1686242818832397, "kl": 0.21142518520355225, "learning_rate": 3.1957736603078458e-06, "loss": 0.0085, "reward": 2.1418750286102295, "reward_std": 0.6220353245735168, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4700000286102295, "step": 2631 }, { "completion_length": 125.46875, "epoch": 1.4082397003745317, "grad_norm": 0.8474605083465576, "kl": 0.20782697200775146, "learning_rate": 3.1942785429416854e-06, "loss": 0.0083, "reward": 1.7534687519073486, "reward_std": 0.8343859910964966, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47221875190734863, "step": 2632 }, { "completion_length": 109.0625, "epoch": 1.4087747458533975, "grad_norm": 1.9825823307037354, "kl": 0.172119602560997, "learning_rate": 3.1927831564598585e-06, "loss": 0.0069, "reward": 2.6650311946868896, "reward_std": 0.5453781485557556, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4931562542915344, "step": 2633 }, { "completion_length": 114.3125, "epoch": 1.4093097913322632, "grad_norm": 245289156608.0, "kl": 3725931264.0, "learning_rate": 3.1912875014420044e-06, "loss": 149037248.0, "reward": 2.3189375400543213, "reward_std": 0.6855499744415283, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4595625102519989, "step": 2634 }, { "completion_length": 133.8125, "epoch": 1.409844836811129, "grad_norm": 1.1168683767318726, "kl": 0.28117239475250244, "learning_rate": 3.189791578467868e-06, "loss": 0.0112, "reward": 2.200437545776367, "reward_std": 0.37698784470558167, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4348125159740448, "step": 2635 }, { "completion_length": 141.71875, "epoch": 1.4103798822899947, "grad_norm": 0.7071778774261475, "kl": 0.1424272656440735, "learning_rate": 3.1882953881172974e-06, "loss": 0.0057, "reward": 1.8926563262939453, "reward_std": 0.8994118571281433, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47078126668930054, "step": 2636 }, { "completion_length": 136.65625, "epoch": 1.4109149277688604, "grad_norm": 0.7296389937400818, "kl": 0.23629450798034668, "learning_rate": 3.186798930970244e-06, "loss": 0.0095, "reward": 1.8198437690734863, "reward_std": 0.732750654220581, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47609373927116394, "step": 2637 }, { "completion_length": 136.75, "epoch": 1.4114499732477261, "grad_norm": 1.0920792818069458, "kl": 0.1472052037715912, "learning_rate": 3.185302207606765e-06, "loss": 0.0059, "reward": 1.81640625, "reward_std": 0.5247732400894165, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48828125, "step": 2638 }, { "completion_length": 156.84375, "epoch": 1.4119850187265919, "grad_norm": 1.6617820262908936, "kl": 0.1250755339860916, "learning_rate": 3.1838052186070185e-06, "loss": 0.005, "reward": 1.6054375171661377, "reward_std": 0.8951104879379272, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4179374873638153, "step": 2639 }, { "completion_length": 131.34375, "epoch": 1.4125200642054574, "grad_norm": 1.9376215934753418, "kl": 0.21085000038146973, "learning_rate": 3.1823079645512657e-06, "loss": 0.0084, "reward": 2.132312536239624, "reward_std": 0.9433231949806213, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42918750643730164, "step": 2640 }, { "completion_length": 117.96875, "epoch": 1.4130551096843231, "grad_norm": 0.9373258948326111, "kl": 0.15835300087928772, "learning_rate": 3.1808104460198702e-06, "loss": 0.0063, "reward": 2.7378125190734863, "reward_std": 0.5963742733001709, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48781251907348633, "step": 2641 }, { "completion_length": 146.09375, "epoch": 1.4135901551631889, "grad_norm": 2.4633326530456543, "kl": 0.1490173041820526, "learning_rate": 3.1793126635932997e-06, "loss": 0.006, "reward": 1.8110312223434448, "reward_std": 0.8004682064056396, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4516562521457672, "step": 2642 }, { "completion_length": 123.53125, "epoch": 1.4141252006420546, "grad_norm": 3.3930604457855225, "kl": 0.23612546920776367, "learning_rate": 3.177814617852124e-06, "loss": 0.0094, "reward": 2.5102500915527344, "reward_std": 0.6085602045059204, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4946250021457672, "step": 2643 }, { "completion_length": 105.40625, "epoch": 1.4146602461209203, "grad_norm": 2.889880657196045, "kl": 0.24978488683700562, "learning_rate": 3.1763163093770137e-06, "loss": 0.01, "reward": 2.28125, "reward_std": 0.466095507144928, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2644 }, { "completion_length": 126.0625, "epoch": 1.4151952915997859, "grad_norm": 0.8821243047714233, "kl": 0.28161364793777466, "learning_rate": 3.1748177387487422e-06, "loss": 0.0113, "reward": 2.5443124771118164, "reward_std": 0.7084455490112305, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4818125069141388, "step": 2645 }, { "completion_length": 135.96875, "epoch": 1.4157303370786516, "grad_norm": 1.044907808303833, "kl": 0.19256813824176788, "learning_rate": 3.1733189065481836e-06, "loss": 0.0077, "reward": 1.3325624465942383, "reward_std": 0.4917612075805664, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47318750619888306, "step": 2646 }, { "completion_length": 136.0625, "epoch": 1.4162653825575173, "grad_norm": 1.069688081741333, "kl": 0.16007797420024872, "learning_rate": 3.1718198133563133e-06, "loss": 0.0064, "reward": 1.9254062175750732, "reward_std": 0.7120649814605713, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47228124737739563, "step": 2647 }, { "completion_length": 138.1875, "epoch": 1.416800428036383, "grad_norm": 0.9062031507492065, "kl": 0.15453065931797028, "learning_rate": 3.17032045975421e-06, "loss": 0.0062, "reward": 2.03125, "reward_std": 0.8571276068687439, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 2648 }, { "completion_length": 138.625, "epoch": 1.4173354735152488, "grad_norm": 0.895904004573822, "kl": 0.14885565638542175, "learning_rate": 3.168820846323053e-06, "loss": 0.006, "reward": 1.799218773841858, "reward_std": 0.8529830574989319, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4867187440395355, "step": 2649 }, { "completion_length": 141.125, "epoch": 1.4178705189941145, "grad_norm": 1.4436293840408325, "kl": 0.20802462100982666, "learning_rate": 3.167320973644118e-06, "loss": 0.0083, "reward": 1.3635938167572021, "reward_std": 0.4724034070968628, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.441718727350235, "step": 2650 }, { "completion_length": 149.75, "epoch": 1.4184055644729803, "grad_norm": 1.528080701828003, "kl": 0.15324766933918, "learning_rate": 3.1658208422987856e-06, "loss": 0.0061, "reward": 1.4412813186645508, "reward_std": 0.6295961141586304, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.472531259059906, "step": 2651 }, { "completion_length": 118.5, "epoch": 1.418940609951846, "grad_norm": 0.6429635882377625, "kl": 0.19648799300193787, "learning_rate": 3.1643204528685374e-06, "loss": 0.0079, "reward": 1.7455313205718994, "reward_std": 0.8098005652427673, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47990626096725464, "step": 2652 }, { "completion_length": 137.4375, "epoch": 1.4194756554307117, "grad_norm": 0.9116630554199219, "kl": 0.1697566658258438, "learning_rate": 3.16281980593495e-06, "loss": 0.0068, "reward": 1.718093752861023, "reward_std": 0.662113606929779, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46809375286102295, "step": 2653 }, { "completion_length": 139.6875, "epoch": 1.4200107009095773, "grad_norm": 2.554290533065796, "kl": 0.16510623693466187, "learning_rate": 3.1613189020797057e-06, "loss": 0.0066, "reward": 2.3763749599456787, "reward_std": 0.9007653594017029, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4701249897480011, "step": 2654 }, { "completion_length": 134.78125, "epoch": 1.420545746388443, "grad_norm": 1.040204644203186, "kl": 0.22476711869239807, "learning_rate": 3.159817741884582e-06, "loss": 0.009, "reward": 1.769281268119812, "reward_std": 0.6301116347312927, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.488031268119812, "step": 2655 }, { "completion_length": 127.4375, "epoch": 1.4210807918673087, "grad_norm": 1.2729921340942383, "kl": 0.18346378207206726, "learning_rate": 3.1583163259314577e-06, "loss": 0.0073, "reward": 2.1439061164855957, "reward_std": 0.794898271560669, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48765623569488525, "step": 2656 }, { "completion_length": 143.625, "epoch": 1.4216158373461745, "grad_norm": 4.524959564208984, "kl": 0.24879367649555206, "learning_rate": 3.1568146548023105e-06, "loss": 0.01, "reward": 1.7734375, "reward_std": 0.6584341526031494, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 2657 }, { "completion_length": 123.125, "epoch": 1.4221508828250402, "grad_norm": 0.6760925650596619, "kl": 0.15692143142223358, "learning_rate": 3.1553127290792162e-06, "loss": 0.0063, "reward": 1.9609375, "reward_std": 0.5733954906463623, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4921875, "step": 2658 }, { "completion_length": 124.125, "epoch": 1.4226859283039057, "grad_norm": 0.8941085934638977, "kl": 0.18488416075706482, "learning_rate": 3.153810549344351e-06, "loss": 0.0074, "reward": 2.2734375, "reward_std": 1.0021073818206787, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4765625, "step": 2659 }, { "completion_length": 129.21875, "epoch": 1.4232209737827715, "grad_norm": 0.9776930809020996, "kl": 0.2653927505016327, "learning_rate": 3.1523081161799882e-06, "loss": 0.0106, "reward": 2.296687602996826, "reward_std": 0.8495648503303528, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499812513589859, "step": 2660 }, { "completion_length": 140.875, "epoch": 1.4237560192616372, "grad_norm": 878630080.0, "kl": 710114.4375, "learning_rate": 3.1508054301684994e-06, "loss": 28404.584, "reward": 1.7473125457763672, "reward_std": 0.46383920311927795, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4660625159740448, "step": 2661 }, { "completion_length": 138.125, "epoch": 1.424291064740503, "grad_norm": 2.2172834873199463, "kl": 0.23518787324428558, "learning_rate": 3.1493024918923555e-06, "loss": 0.0094, "reward": 1.70703125, "reward_std": 0.6897885203361511, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48828125, "step": 2662 }, { "completion_length": 128.03125, "epoch": 1.4248261102193687, "grad_norm": 1.9221043586730957, "kl": 0.2138238251209259, "learning_rate": 3.147799301934123e-06, "loss": 0.0086, "reward": 1.84765625, "reward_std": 0.3553834557533264, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 2663 }, { "completion_length": 132.0, "epoch": 1.4253611556982344, "grad_norm": 2.76596999168396, "kl": 0.199199378490448, "learning_rate": 3.146295860876467e-06, "loss": 0.008, "reward": 2.031125068664551, "reward_std": 0.660265326499939, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.468625009059906, "step": 2664 }, { "completion_length": 155.75, "epoch": 1.4258962011771001, "grad_norm": 1.739395260810852, "kl": 0.22068271040916443, "learning_rate": 3.144792169302151e-06, "loss": 0.0088, "reward": 2.125, "reward_std": 1.0178557634353638, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.421875, "step": 2665 }, { "completion_length": 139.875, "epoch": 1.4264312466559659, "grad_norm": 0.7239546179771423, "kl": 0.1642371118068695, "learning_rate": 3.1432882277940347e-06, "loss": 0.0066, "reward": 1.7129688262939453, "reward_std": 0.7201026678085327, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44734376668930054, "step": 2666 }, { "completion_length": 141.75, "epoch": 1.4269662921348314, "grad_norm": 1.000302791595459, "kl": 0.14406254887580872, "learning_rate": 3.1417840369350734e-06, "loss": 0.0058, "reward": 1.7734375, "reward_std": 0.7129366397857666, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 2667 }, { "completion_length": 125.03125, "epoch": 1.4275013376136971, "grad_norm": 0.6752169728279114, "kl": 0.13243339955806732, "learning_rate": 3.1402795973083215e-06, "loss": 0.0053, "reward": 1.7578125, "reward_std": 0.36986804008483887, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4921875, "step": 2668 }, { "completion_length": 119.84375, "epoch": 1.4280363830925629, "grad_norm": 0.2791626751422882, "kl": 0.14326390624046326, "learning_rate": 3.1387749094969287e-06, "loss": 0.0057, "reward": 2.25, "reward_std": 0.26726123690605164, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2669 }, { "completion_length": 152.6875, "epoch": 1.4285714285714286, "grad_norm": 1.0131208896636963, "kl": 0.21722230315208435, "learning_rate": 3.1372699740841393e-06, "loss": 0.0087, "reward": 1.3259062767028809, "reward_std": 0.6587094068527222, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43528124690055847, "step": 2670 }, { "completion_length": 99.40625, "epoch": 1.4291064740502943, "grad_norm": 1.086410641670227, "kl": 0.2178860306739807, "learning_rate": 3.1357647916532974e-06, "loss": 0.0087, "reward": 2.2106876373291016, "reward_std": 0.482843279838562, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.491937518119812, "step": 2671 }, { "completion_length": 127.59375, "epoch": 1.4296415195291599, "grad_norm": 1.3839794397354126, "kl": 0.16493354737758636, "learning_rate": 3.134259362787839e-06, "loss": 0.0066, "reward": 2.0367813110351562, "reward_std": 1.0682199001312256, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4430312514305115, "step": 2672 }, { "completion_length": 120.65625, "epoch": 1.4301765650080256, "grad_norm": 1.5535703897476196, "kl": 0.17665588855743408, "learning_rate": 3.1327536880712973e-06, "loss": 0.0071, "reward": 2.25, "reward_std": 0.585008442401886, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2673 }, { "completion_length": 123.5, "epoch": 1.4307116104868913, "grad_norm": 0.5433176159858704, "kl": 0.14707213640213013, "learning_rate": 3.131247768087301e-06, "loss": 0.0059, "reward": 1.5484062433242798, "reward_std": 0.3848605751991272, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4859062433242798, "step": 2674 }, { "completion_length": 128.03125, "epoch": 1.431246655965757, "grad_norm": 1.52193284034729, "kl": 0.17863450944423676, "learning_rate": 3.129741603419573e-06, "loss": 0.0071, "reward": 2.13671875, "reward_std": 0.6428536772727966, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48046875, "step": 2675 }, { "completion_length": 113.46875, "epoch": 1.4317817014446228, "grad_norm": 3.7978827953338623, "kl": 0.30455929040908813, "learning_rate": 3.128235194651932e-06, "loss": 0.0122, "reward": 1.765625, "reward_std": 0.2643740773200989, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 2676 }, { "completion_length": 129.96875, "epoch": 1.4323167469234885, "grad_norm": 0.46590232849121094, "kl": 0.13560475409030914, "learning_rate": 3.1267285423682912e-06, "loss": 0.0054, "reward": 2.434718608856201, "reward_std": 0.6773616075515747, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4503437578678131, "step": 2677 }, { "completion_length": 111.96875, "epoch": 1.4328517924023543, "grad_norm": 0.5271268486976624, "kl": 0.18676692247390747, "learning_rate": 3.1252216471526574e-06, "loss": 0.0075, "reward": 1.964343786239624, "reward_std": 0.25788167119026184, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49559375643730164, "step": 2678 }, { "completion_length": 137.3125, "epoch": 1.43338683788122, "grad_norm": 1.1066330671310425, "kl": 0.18892893195152283, "learning_rate": 3.123714509589133e-06, "loss": 0.0076, "reward": 1.726656198501587, "reward_std": 0.6138355731964111, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4610312581062317, "step": 2679 }, { "completion_length": 157.71875, "epoch": 1.4339218833600855, "grad_norm": 0.5861799716949463, "kl": 0.131764754652977, "learning_rate": 3.122207130261912e-06, "loss": 0.0053, "reward": 1.820718765258789, "reward_std": 1.257490873336792, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.39884376525878906, "step": 2680 }, { "completion_length": 122.09375, "epoch": 1.4344569288389513, "grad_norm": 1.1202703714370728, "kl": 0.1593080461025238, "learning_rate": 3.1206995097552843e-06, "loss": 0.0064, "reward": 2.09765625, "reward_std": 0.8890477418899536, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 2681 }, { "completion_length": 117.375, "epoch": 1.434991974317817, "grad_norm": 1.9902827739715576, "kl": 0.24343344569206238, "learning_rate": 3.1191916486536332e-06, "loss": 0.0097, "reward": 2.729562520980835, "reward_std": 1.0271886587142944, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4951874911785126, "step": 2682 }, { "completion_length": 125.5625, "epoch": 1.4355270197966827, "grad_norm": 0.9679769277572632, "kl": 0.18003153800964355, "learning_rate": 3.1176835475414336e-06, "loss": 0.0072, "reward": 2.2968125343322754, "reward_std": 0.7253851890563965, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 2683 }, { "completion_length": 126.875, "epoch": 1.4360620652755485, "grad_norm": 1.0906025171279907, "kl": 0.17749151587486267, "learning_rate": 3.1161752070032537e-06, "loss": 0.0071, "reward": 2.6242499351501465, "reward_std": 0.7418640851974487, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49924999475479126, "step": 2684 }, { "completion_length": 124.71875, "epoch": 1.4365971107544142, "grad_norm": 3.6390929222106934, "kl": 0.32357141375541687, "learning_rate": 3.1146666276237557e-06, "loss": 0.0129, "reward": 3.234375, "reward_std": 0.7513009309768677, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2685 }, { "completion_length": 145.1875, "epoch": 1.4371321562332797, "grad_norm": 0.6040043830871582, "kl": 0.20263665914535522, "learning_rate": 3.1131578099876957e-06, "loss": 0.0081, "reward": 1.7619062662124634, "reward_std": 0.6331698894500732, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4025312662124634, "step": 2686 }, { "completion_length": 120.3125, "epoch": 1.4376672017121455, "grad_norm": 0.9975663423538208, "kl": 0.18496258556842804, "learning_rate": 3.111648754679918e-06, "loss": 0.0074, "reward": 1.3629686832427979, "reward_std": 0.4507412314414978, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4879687428474426, "step": 2687 }, { "completion_length": 119.75, "epoch": 1.4382022471910112, "grad_norm": 2.236823797225952, "kl": 0.1885615587234497, "learning_rate": 3.110139462285362e-06, "loss": 0.0075, "reward": 2.3046875, "reward_std": 0.9070607423782349, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 2688 }, { "completion_length": 132.53125, "epoch": 1.438737292669877, "grad_norm": 0.8355998396873474, "kl": 0.18414419889450073, "learning_rate": 3.1086299333890586e-06, "loss": 0.0074, "reward": 2.1804375648498535, "reward_std": 0.7541989088058472, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49293750524520874, "step": 2689 }, { "completion_length": 111.0625, "epoch": 1.4392723381487427, "grad_norm": 1.001505732536316, "kl": 0.18123728036880493, "learning_rate": 3.1071201685761304e-06, "loss": 0.0072, "reward": 2.98256254196167, "reward_std": 1.0184078216552734, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48256251215934753, "step": 2690 }, { "completion_length": 101.125, "epoch": 1.4398073836276084, "grad_norm": 0.24701198935508728, "kl": 0.20991650223731995, "learning_rate": 3.1056101684317907e-06, "loss": 0.0084, "reward": 2.5625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2691 }, { "completion_length": 129.28125, "epoch": 1.4403424291064741, "grad_norm": 1.9649972915649414, "kl": 0.18199491500854492, "learning_rate": 3.104099933541345e-06, "loss": 0.0073, "reward": 2.65625, "reward_std": 0.706595242023468, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2692 }, { "completion_length": 131.90625, "epoch": 1.4408774745853399, "grad_norm": 1.5256544351577759, "kl": 0.18508028984069824, "learning_rate": 3.102589464490188e-06, "loss": 0.0074, "reward": 1.8537812232971191, "reward_std": 0.871503472328186, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44753125309944153, "step": 2693 }, { "completion_length": 141.53125, "epoch": 1.4414125200642054, "grad_norm": 1.0679700374603271, "kl": 0.15054211020469666, "learning_rate": 3.101078761863808e-06, "loss": 0.006, "reward": 1.84375, "reward_std": 0.6643523573875427, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.484375, "step": 2694 }, { "completion_length": 145.875, "epoch": 1.4419475655430711, "grad_norm": 1.6600682735443115, "kl": 0.18513740599155426, "learning_rate": 3.099567826247781e-06, "loss": 0.0074, "reward": 2.05078125, "reward_std": 1.0267670154571533, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.44140625, "step": 2695 }, { "completion_length": 127.46875, "epoch": 1.4424826110219369, "grad_norm": 1.4117361307144165, "kl": 0.23881207406520844, "learning_rate": 3.0980566582277766e-06, "loss": 0.0096, "reward": 2.2578125, "reward_std": 0.6907069683074951, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 2696 }, { "completion_length": 139.09375, "epoch": 1.4430176565008026, "grad_norm": 0.7286036014556885, "kl": 0.20857155323028564, "learning_rate": 3.0965452583895513e-06, "loss": 0.0083, "reward": 1.6250312328338623, "reward_std": 0.8759803771972656, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4375312328338623, "step": 2697 }, { "completion_length": 140.65625, "epoch": 1.4435527019796683, "grad_norm": 1143793.625, "kl": 39354.54296875, "learning_rate": 3.095033627318951e-06, "loss": 1574.1813, "reward": 2.010718822479248, "reward_std": 0.9577707052230835, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47946876287460327, "step": 2698 }, { "completion_length": 124.375, "epoch": 1.4440877474585339, "grad_norm": 5.198607921600342, "kl": 0.3917187750339508, "learning_rate": 3.0935217656019145e-06, "loss": 0.0157, "reward": 2.4403750896453857, "reward_std": 0.863897442817688, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4716250002384186, "step": 2699 }, { "completion_length": 122.09375, "epoch": 1.4446227929373996, "grad_norm": 1.3863040208816528, "kl": 0.18705809116363525, "learning_rate": 3.0920096738244693e-06, "loss": 0.0075, "reward": 2.7528748512268066, "reward_std": 0.9635570049285889, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4872500002384186, "step": 2700 }, { "completion_length": 144.4375, "epoch": 1.4451578384162653, "grad_norm": 2.1338346004486084, "kl": 0.18427518010139465, "learning_rate": 3.090497352572729e-06, "loss": 0.0074, "reward": 1.4156875610351562, "reward_std": 0.8583076000213623, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4156875014305115, "step": 2701 }, { "completion_length": 119.875, "epoch": 1.445692883895131, "grad_norm": 1.6304254531860352, "kl": 0.22342625260353088, "learning_rate": 3.0889848024329e-06, "loss": 0.0089, "reward": 2.5843124389648438, "reward_std": 0.51277095079422, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4593124985694885, "step": 2702 }, { "completion_length": 122.6875, "epoch": 1.4462279293739968, "grad_norm": 974847.0, "kl": 200091.59375, "learning_rate": 3.087472023991275e-06, "loss": 8003.6641, "reward": 2.272125005722046, "reward_std": 1.101428508758545, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4752500057220459, "step": 2703 }, { "completion_length": 125.0625, "epoch": 1.4467629748528625, "grad_norm": 3.5989153385162354, "kl": 0.18341007828712463, "learning_rate": 3.0859590178342356e-06, "loss": 0.0073, "reward": 2.1875, "reward_std": 0.41351333260536194, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 2704 }, { "completion_length": 125.03125, "epoch": 1.4472980203317283, "grad_norm": 0.6349268555641174, "kl": 0.14691326022148132, "learning_rate": 3.084445784548253e-06, "loss": 0.0059, "reward": 2.2384374141693115, "reward_std": 0.9419565200805664, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4728124737739563, "step": 2705 }, { "completion_length": 139.03125, "epoch": 1.447833065810594, "grad_norm": 1.0996708869934082, "kl": 0.19442355632781982, "learning_rate": 3.082932324719885e-06, "loss": 0.0078, "reward": 2.83203125, "reward_std": 1.010793924331665, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 2706 }, { "completion_length": 132.28125, "epoch": 1.4483681112894595, "grad_norm": 1.5048779249191284, "kl": 0.20610320568084717, "learning_rate": 3.0814186389357765e-06, "loss": 0.0082, "reward": 1.813406229019165, "reward_std": 0.6242954730987549, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48528122901916504, "step": 2707 }, { "completion_length": 103.5, "epoch": 1.4489031567683253, "grad_norm": 1.5522874593734741, "kl": 0.21434634923934937, "learning_rate": 3.079904727782662e-06, "loss": 0.0086, "reward": 2.58203125, "reward_std": 0.8436408042907715, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 2708 }, { "completion_length": 133.78125, "epoch": 1.449438202247191, "grad_norm": 0.6344096660614014, "kl": 0.20074810087680817, "learning_rate": 3.078390591847363e-06, "loss": 0.008, "reward": 1.9804375171661377, "reward_std": 0.7545703649520874, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4491875171661377, "step": 2709 }, { "completion_length": 140.8125, "epoch": 1.4499732477260567, "grad_norm": 1.183911681175232, "kl": 0.17117735743522644, "learning_rate": 3.0768762317167876e-06, "loss": 0.0068, "reward": 1.352468729019165, "reward_std": 0.6988143920898438, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43059372901916504, "step": 2710 }, { "completion_length": 135.71875, "epoch": 1.4505082932049225, "grad_norm": 14.83144760131836, "kl": 0.36549264192581177, "learning_rate": 3.07536164797793e-06, "loss": 0.0146, "reward": 2.056999921798706, "reward_std": 0.7736561298370361, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44762498140335083, "step": 2711 }, { "completion_length": 138.03125, "epoch": 1.451043338683788, "grad_norm": 0.9265734553337097, "kl": 0.3407841920852661, "learning_rate": 3.073846841217872e-06, "loss": 0.0136, "reward": 2.407656192779541, "reward_std": 0.8035178780555725, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4701562523841858, "step": 2712 }, { "completion_length": 125.96875, "epoch": 1.4515783841626537, "grad_norm": 1.370771050453186, "kl": 0.23171553015708923, "learning_rate": 3.0723318120237817e-06, "loss": 0.0093, "reward": 2.5166876316070557, "reward_std": 0.6286351680755615, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4854375123977661, "step": 2713 }, { "completion_length": 117.03125, "epoch": 1.4521134296415195, "grad_norm": 0.5292516946792603, "kl": 0.15929031372070312, "learning_rate": 3.0708165609829142e-06, "loss": 0.0064, "reward": 2.315906286239624, "reward_std": 0.284197598695755, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48778125643730164, "step": 2714 }, { "completion_length": 119.34375, "epoch": 1.4526484751203852, "grad_norm": 0.48098406195640564, "kl": 0.13491907715797424, "learning_rate": 3.069301088682609e-06, "loss": 0.0054, "reward": 2.256999969482422, "reward_std": 0.35193878412246704, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47574999928474426, "step": 2715 }, { "completion_length": 97.5, "epoch": 1.453183520599251, "grad_norm": 1.0498608350753784, "kl": 0.1908194124698639, "learning_rate": 3.0677853957102928e-06, "loss": 0.0076, "reward": 2.261593818664551, "reward_std": 0.8147416114807129, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.495968759059906, "step": 2716 }, { "completion_length": 169.34375, "epoch": 1.4537185660781167, "grad_norm": 0.5489917397499084, "kl": 0.12229283154010773, "learning_rate": 3.0662694826534773e-06, "loss": 0.0049, "reward": 1.1540000438690186, "reward_std": 0.6966162919998169, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3571249842643738, "step": 2717 }, { "completion_length": 104.4375, "epoch": 1.4542536115569824, "grad_norm": 1.3015283346176147, "kl": 0.18319594860076904, "learning_rate": 3.064753350099758e-06, "loss": 0.0073, "reward": 1.8147187232971191, "reward_std": 0.7340071201324463, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48659375309944153, "step": 2718 }, { "completion_length": 115.65625, "epoch": 1.4547886570358481, "grad_norm": 0.8875210285186768, "kl": 0.25935810804367065, "learning_rate": 3.0632369986368188e-06, "loss": 0.0104, "reward": 2.140625, "reward_std": 0.33284348249435425, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 2719 }, { "completion_length": 125.15625, "epoch": 1.4553237025147139, "grad_norm": 1.160335659980774, "kl": 0.22944894433021545, "learning_rate": 3.061720428852426e-06, "loss": 0.0092, "reward": 1.4586563110351562, "reward_std": 0.4722036123275757, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4742812514305115, "step": 2720 }, { "completion_length": 159.0625, "epoch": 1.4558587479935794, "grad_norm": 1.138526439666748, "kl": 0.1517876386642456, "learning_rate": 3.0602036413344305e-06, "loss": 0.0061, "reward": 1.9116562604904175, "reward_std": 0.8044588565826416, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4585312604904175, "step": 2721 }, { "completion_length": 126.09375, "epoch": 1.4563937934724451, "grad_norm": 1.0806487798690796, "kl": 0.2389419674873352, "learning_rate": 3.0586866366707686e-06, "loss": 0.0096, "reward": 2.40625, "reward_std": 1.060610055923462, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2722 }, { "completion_length": 129.71875, "epoch": 1.4569288389513109, "grad_norm": 1.4595324993133545, "kl": 0.182011216878891, "learning_rate": 3.057169415449461e-06, "loss": 0.0073, "reward": 1.961093783378601, "reward_std": 0.9713665246963501, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4767187535762787, "step": 2723 }, { "completion_length": 121.40625, "epoch": 1.4574638844301766, "grad_norm": 1.1697957515716553, "kl": 0.15986216068267822, "learning_rate": 3.05565197825861e-06, "loss": 0.0064, "reward": 2.546875, "reward_std": 0.22097086906433105, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2724 }, { "completion_length": 109.125, "epoch": 1.4579989299090423, "grad_norm": 1.1722640991210938, "kl": 0.3218252658843994, "learning_rate": 3.054134325686406e-06, "loss": 0.0129, "reward": 2.362968921661377, "reward_std": 0.7871748208999634, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4879687428474426, "step": 2725 }, { "completion_length": 117.4375, "epoch": 1.4585339753879079, "grad_norm": 0.7722155451774597, "kl": 0.16420623660087585, "learning_rate": 3.052616458321118e-06, "loss": 0.0066, "reward": 1.8397188186645508, "reward_std": 0.8736599683761597, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.480343759059906, "step": 2726 }, { "completion_length": 122.125, "epoch": 1.4590690208667736, "grad_norm": 1.1222801208496094, "kl": 0.19758957624435425, "learning_rate": 3.051098376751101e-06, "loss": 0.0079, "reward": 2.34375, "reward_std": 0.7367773056030273, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2727 }, { "completion_length": 135.09375, "epoch": 1.4596040663456393, "grad_norm": 1.0999467372894287, "kl": 0.18769241869449615, "learning_rate": 3.049580081564793e-06, "loss": 0.0075, "reward": 1.67578125, "reward_std": 0.6089509725570679, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45703125, "step": 2728 }, { "completion_length": 119.21875, "epoch": 1.460139111824505, "grad_norm": 0.7566143870353699, "kl": 0.29200392961502075, "learning_rate": 3.048061573350714e-06, "loss": 0.0117, "reward": 2.5239062309265137, "reward_std": 0.7462931275367737, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49265623092651367, "step": 2729 }, { "completion_length": 130.84375, "epoch": 1.4606741573033708, "grad_norm": 2.0071206092834473, "kl": 0.17579737305641174, "learning_rate": 3.046542852697467e-06, "loss": 0.007, "reward": 2.360156297683716, "reward_std": 0.9477616548538208, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46953123807907104, "step": 2730 }, { "completion_length": 120.65625, "epoch": 1.4612092027822365, "grad_norm": 0.464474081993103, "kl": 0.16900044679641724, "learning_rate": 3.0450239201937364e-06, "loss": 0.0068, "reward": 2.606656312942505, "reward_std": 0.5438549518585205, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4816562533378601, "step": 2731 }, { "completion_length": 123.375, "epoch": 1.4617442482611023, "grad_norm": 1.5653977394104004, "kl": 0.2818079888820648, "learning_rate": 3.043504776428291e-06, "loss": 0.0113, "reward": 2.7481250762939453, "reward_std": 0.666313648223877, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48250001668930054, "step": 2732 }, { "completion_length": 118.09375, "epoch": 1.462279293739968, "grad_norm": 1.5391191244125366, "kl": 0.4800640344619751, "learning_rate": 3.0419854219899784e-06, "loss": 0.0192, "reward": 2.890500068664551, "reward_std": 0.2644124925136566, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 2733 }, { "completion_length": 138.75, "epoch": 1.4628143392188335, "grad_norm": 2.581066370010376, "kl": 0.26831796765327454, "learning_rate": 3.0404658574677317e-06, "loss": 0.0107, "reward": 2.0325000286102295, "reward_std": 1.0119484663009644, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4699999988079071, "step": 2734 }, { "completion_length": 141.96875, "epoch": 1.4633493846976993, "grad_norm": 1.3662123680114746, "kl": 0.19262570142745972, "learning_rate": 3.0389460834505608e-06, "loss": 0.0077, "reward": 1.8459999561309814, "reward_std": 0.8244301080703735, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47099998593330383, "step": 2735 }, { "completion_length": 139.875, "epoch": 1.463884430176565, "grad_norm": 0.7363409399986267, "kl": 0.14189675450325012, "learning_rate": 3.0374261005275606e-06, "loss": 0.0057, "reward": 2.0044686794281006, "reward_std": 0.8492496013641357, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45759373903274536, "step": 2736 }, { "completion_length": 145.15625, "epoch": 1.4644194756554307, "grad_norm": 4.064396858215332, "kl": 0.22467201948165894, "learning_rate": 3.035905909287905e-06, "loss": 0.009, "reward": 1.6614375114440918, "reward_std": 0.5555706024169922, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3801875114440918, "step": 2737 }, { "completion_length": 129.8125, "epoch": 1.4649545211342965, "grad_norm": 9265664.0, "kl": 27447.349609375, "learning_rate": 3.0343855103208504e-06, "loss": 1097.8942, "reward": 2.4372501373291016, "reward_std": 0.5389419794082642, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.499750018119812, "step": 2738 }, { "completion_length": 99.625, "epoch": 1.465489566613162, "grad_norm": 0.4337342381477356, "kl": 0.2081543356180191, "learning_rate": 3.0328649042157316e-06, "loss": 0.0083, "reward": 3.046875, "reward_std": 0.6093180775642395, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2739 }, { "completion_length": 126.5, "epoch": 1.4660246120920277, "grad_norm": 1.6428918838500977, "kl": 0.26000314950942993, "learning_rate": 3.0313440915619653e-06, "loss": 0.0104, "reward": 2.11328125, "reward_std": 0.9179863333702087, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48828125, "step": 2740 }, { "completion_length": 112.5, "epoch": 1.4665596575708935, "grad_norm": 4.779312610626221, "kl": 0.577158510684967, "learning_rate": 3.0298230729490454e-06, "loss": 0.0231, "reward": 1.6698750257492065, "reward_std": 0.5941526293754578, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49799999594688416, "step": 2741 }, { "completion_length": 121.46875, "epoch": 1.4670947030497592, "grad_norm": 0.8200541138648987, "kl": 0.15572255849838257, "learning_rate": 3.0283018489665512e-06, "loss": 0.0062, "reward": 2.09765625, "reward_std": 0.8083906173706055, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 2742 }, { "completion_length": 118.0, "epoch": 1.467629748528625, "grad_norm": 1.6480684280395508, "kl": 0.1989297866821289, "learning_rate": 3.0267804202041363e-06, "loss": 0.008, "reward": 2.90625, "reward_std": 0.937757134437561, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2743 }, { "completion_length": 164.28125, "epoch": 1.4681647940074907, "grad_norm": 2.439995765686035, "kl": 0.17323073744773865, "learning_rate": 3.025258787251536e-06, "loss": 0.0069, "reward": 1.507312536239624, "reward_std": 0.7851555943489075, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.39793750643730164, "step": 2744 }, { "completion_length": 119.6875, "epoch": 1.4686998394863564, "grad_norm": 0.9675682783126831, "kl": 0.17726312577724457, "learning_rate": 3.023736950698565e-06, "loss": 0.0071, "reward": 1.685093641281128, "reward_std": 0.8130400776863098, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4819687604904175, "step": 2745 }, { "completion_length": 135.96875, "epoch": 1.4692348849652221, "grad_norm": 41.622928619384766, "kl": 3.4470131397247314, "learning_rate": 3.0222149111351145e-06, "loss": 0.1379, "reward": 1.5155625343322754, "reward_std": 0.7643975019454956, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.453062504529953, "step": 2746 }, { "completion_length": 118.03125, "epoch": 1.4697699304440879, "grad_norm": 0.9084120392799377, "kl": 0.22429096698760986, "learning_rate": 3.020692669151158e-06, "loss": 0.009, "reward": 2.046875, "reward_std": 1.0027047395706177, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 2747 }, { "completion_length": 134.0625, "epoch": 1.4703049759229534, "grad_norm": 0.946853518486023, "kl": 0.17223112285137177, "learning_rate": 3.0191702253367457e-06, "loss": 0.0069, "reward": 2.578125, "reward_std": 0.5459640622138977, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.453125, "step": 2748 }, { "completion_length": 122.21875, "epoch": 1.4708400214018191, "grad_norm": 1.2130458354949951, "kl": 0.18929284811019897, "learning_rate": 3.0176475802820043e-06, "loss": 0.0076, "reward": 2.370500087738037, "reward_std": 0.6651470065116882, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49549999833106995, "step": 2749 }, { "completion_length": 103.34375, "epoch": 1.4713750668806849, "grad_norm": 0.49906250834465027, "kl": 0.22839705646038055, "learning_rate": 3.0161247345771424e-06, "loss": 0.0091, "reward": 1.95703125, "reward_std": 0.12153397500514984, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 2750 }, { "completion_length": 129.03125, "epoch": 1.4719101123595506, "grad_norm": 2.959152936935425, "kl": 0.1666240394115448, "learning_rate": 3.0146016888124423e-06, "loss": 0.0067, "reward": 2.5625, "reward_std": 0.7517592906951904, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 2751 }, { "completion_length": 127.59375, "epoch": 1.4724451578384163, "grad_norm": 1.6308708190917969, "kl": 0.2208322435617447, "learning_rate": 3.0130784435782667e-06, "loss": 0.0088, "reward": 1.8270001411437988, "reward_std": 0.8195948600769043, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4832499921321869, "step": 2752 }, { "completion_length": 105.09375, "epoch": 1.4729802033172819, "grad_norm": 1.0831477642059326, "kl": 0.19357892870903015, "learning_rate": 3.0115549994650546e-06, "loss": 0.0077, "reward": 2.4605937004089355, "reward_std": 0.8127700090408325, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4762187600135803, "step": 2753 }, { "completion_length": 142.09375, "epoch": 1.4735152487961476, "grad_norm": 1.7718969583511353, "kl": 0.17033520340919495, "learning_rate": 3.010031357063322e-06, "loss": 0.0068, "reward": 2.2179064750671387, "reward_std": 1.023500680923462, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48353126645088196, "step": 2754 }, { "completion_length": 151.0, "epoch": 1.4740502942750133, "grad_norm": 0.569105863571167, "kl": 0.13979405164718628, "learning_rate": 3.008507516963662e-06, "loss": 0.0056, "reward": 1.5414375066757202, "reward_std": 0.726677417755127, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4633125066757202, "step": 2755 }, { "completion_length": 117.40625, "epoch": 1.474585339753879, "grad_norm": 0.886481523513794, "kl": 0.19906477630138397, "learning_rate": 3.006983479756743e-06, "loss": 0.008, "reward": 2.0756874084472656, "reward_std": 0.5944581031799316, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4819374978542328, "step": 2756 }, { "completion_length": 114.125, "epoch": 1.4751203852327448, "grad_norm": 0.8023622632026672, "kl": 0.22340191900730133, "learning_rate": 3.005459246033314e-06, "loss": 0.0089, "reward": 2.8163750171661377, "reward_std": 1.019089698791504, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4882500171661377, "step": 2757 }, { "completion_length": 118.46875, "epoch": 1.4756554307116105, "grad_norm": 0.7042677998542786, "kl": 0.23577600717544556, "learning_rate": 3.003934816384194e-06, "loss": 0.0094, "reward": 2.4720001220703125, "reward_std": 0.7651689648628235, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48762500286102295, "step": 2758 }, { "completion_length": 131.90625, "epoch": 1.4761904761904763, "grad_norm": 46.862449645996094, "kl": 0.8305417895317078, "learning_rate": 3.002410191400283e-06, "loss": 0.0332, "reward": 2.5536563396453857, "reward_std": 1.084414005279541, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4599062502384186, "step": 2759 }, { "completion_length": 122.75, "epoch": 1.476725521669342, "grad_norm": 0.5660932064056396, "kl": 0.18413738906383514, "learning_rate": 3.000885371672554e-06, "loss": 0.0074, "reward": 2.3043124675750732, "reward_std": 0.5456851720809937, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47618749737739563, "step": 2760 }, { "completion_length": 137.8125, "epoch": 1.4772605671482075, "grad_norm": 0.6012275815010071, "kl": 0.18999522924423218, "learning_rate": 2.9993603577920564e-06, "loss": 0.0076, "reward": 2.0380001068115234, "reward_std": 0.9233769178390503, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4598749876022339, "step": 2761 }, { "completion_length": 130.625, "epoch": 1.4777956126270733, "grad_norm": 0.8476559519767761, "kl": 0.16884511709213257, "learning_rate": 2.997835150349916e-06, "loss": 0.0068, "reward": 2.0038437843322754, "reward_std": 0.5720291137695312, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.425718754529953, "step": 2762 }, { "completion_length": 133.46875, "epoch": 1.478330658105939, "grad_norm": 1.0815157890319824, "kl": 0.1805761605501175, "learning_rate": 2.996309749937331e-06, "loss": 0.0072, "reward": 1.5546875, "reward_std": 0.6918982267379761, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 2763 }, { "completion_length": 122.0, "epoch": 1.4788657035848047, "grad_norm": 0.6397398710250854, "kl": 0.1715618371963501, "learning_rate": 2.994784157145576e-06, "loss": 0.0069, "reward": 2.2890625, "reward_std": 0.7864118218421936, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4921875, "step": 2764 }, { "completion_length": 139.3125, "epoch": 1.4794007490636705, "grad_norm": 2.1593542098999023, "kl": 0.25242453813552856, "learning_rate": 2.993258372566e-06, "loss": 0.0101, "reward": 1.9824687242507935, "reward_std": 0.9065797924995422, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46684375405311584, "step": 2765 }, { "completion_length": 110.15625, "epoch": 1.479935794542536, "grad_norm": 0.9766698479652405, "kl": 0.25234362483024597, "learning_rate": 2.991732396790028e-06, "loss": 0.0101, "reward": 2.4094061851501465, "reward_std": 0.6827230453491211, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48753124475479126, "step": 2766 }, { "completion_length": 122.34375, "epoch": 1.4804708400214017, "grad_norm": 1.0163213014602661, "kl": 0.16847875714302063, "learning_rate": 2.990206230409155e-06, "loss": 0.0067, "reward": 2.899562358856201, "reward_std": 0.7127662301063538, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4933125078678131, "step": 2767 }, { "completion_length": 136.8125, "epoch": 1.4810058855002675, "grad_norm": 2.36004638671875, "kl": 0.14874617755413055, "learning_rate": 2.9886798740149543e-06, "loss": 0.0059, "reward": 2.050687551498413, "reward_std": 1.023338794708252, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4725624918937683, "step": 2768 }, { "completion_length": 118.46875, "epoch": 1.4815409309791332, "grad_norm": 2.5046887397766113, "kl": 0.2678666114807129, "learning_rate": 2.9871533281990693e-06, "loss": 0.0107, "reward": 2.1342499256134033, "reward_std": 1.047275424003601, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4623749852180481, "step": 2769 }, { "completion_length": 128.28125, "epoch": 1.482075976457999, "grad_norm": 8.791450500488281, "kl": 0.16528944671154022, "learning_rate": 2.9856265935532193e-06, "loss": 0.0066, "reward": 2.203125, "reward_std": 0.7668987512588501, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 2770 }, { "completion_length": 124.375, "epoch": 1.4826110219368647, "grad_norm": 5.745534420013428, "kl": 0.5672129988670349, "learning_rate": 2.984099670669196e-06, "loss": 0.0227, "reward": 1.875, "reward_std": 0.8596675992012024, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.453125, "step": 2771 }, { "completion_length": 146.65625, "epoch": 1.4831460674157304, "grad_norm": 1.5440860986709595, "kl": 0.20138898491859436, "learning_rate": 2.982572560138864e-06, "loss": 0.0081, "reward": 1.4019999504089355, "reward_std": 0.5292752981185913, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43324998021125793, "step": 2772 }, { "completion_length": 141.875, "epoch": 1.4836811128945961, "grad_norm": 1.1534184217453003, "kl": 0.20729726552963257, "learning_rate": 2.98104526255416e-06, "loss": 0.0083, "reward": 1.273781180381775, "reward_std": 0.5829790830612183, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4456562399864197, "step": 2773 }, { "completion_length": 126.65625, "epoch": 1.4842161583734619, "grad_norm": 1.557094931602478, "kl": 0.35694169998168945, "learning_rate": 2.979517778507094e-06, "loss": 0.0143, "reward": 2.5679373741149902, "reward_std": 0.3940723240375519, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4741874933242798, "step": 2774 }, { "completion_length": 144.0625, "epoch": 1.4847512038523274, "grad_norm": 626.5067749023438, "kl": 1.3829553127288818, "learning_rate": 2.9779901085897478e-06, "loss": 0.0553, "reward": 1.4373750686645508, "reward_std": 0.791730523109436, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.453000009059906, "step": 2775 }, { "completion_length": 148.1875, "epoch": 1.4852862493311931, "grad_norm": 0.9375156760215759, "kl": 0.18542945384979248, "learning_rate": 2.9764622533942773e-06, "loss": 0.0074, "reward": 1.8895312547683716, "reward_std": 0.9923408031463623, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4364062547683716, "step": 2776 }, { "completion_length": 144.46875, "epoch": 1.4858212948100589, "grad_norm": 0.7911767363548279, "kl": 0.21166691184043884, "learning_rate": 2.974934213512907e-06, "loss": 0.0085, "reward": 1.7931874990463257, "reward_std": 0.6561228036880493, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4650624990463257, "step": 2777 }, { "completion_length": 152.90625, "epoch": 1.4863563402889246, "grad_norm": 0.45843735337257385, "kl": 0.14501112699508667, "learning_rate": 2.973405989537933e-06, "loss": 0.0058, "reward": 1.4523437023162842, "reward_std": 0.5752780437469482, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45234376192092896, "step": 2778 }, { "completion_length": 137.1875, "epoch": 1.4868913857677903, "grad_norm": 14949.6171875, "kl": 629.412841796875, "learning_rate": 2.9718775820617268e-06, "loss": 25.1765, "reward": 1.4160000085830688, "reward_std": 0.7494646310806274, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.40037500858306885, "step": 2779 }, { "completion_length": 136.03125, "epoch": 1.4874264312466559, "grad_norm": 1706101.625, "kl": 515839.75, "learning_rate": 2.9703489916767273e-06, "loss": 20633.5879, "reward": 1.94140625, "reward_std": 0.6442505717277527, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 2780 }, { "completion_length": 133.0, "epoch": 1.4879614767255216, "grad_norm": 1.1644021272659302, "kl": 0.157181516289711, "learning_rate": 2.9688202189754454e-06, "loss": 0.0063, "reward": 2.3445000648498535, "reward_std": 0.998527467250824, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45387500524520874, "step": 2781 }, { "completion_length": 111.03125, "epoch": 1.4884965222043873, "grad_norm": 1.6192171573638916, "kl": 0.2636798918247223, "learning_rate": 2.967291264550463e-06, "loss": 0.0105, "reward": 2.530937671661377, "reward_std": 0.8228346705436707, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4684374928474426, "step": 2782 }, { "completion_length": 159.1875, "epoch": 1.489031567683253, "grad_norm": 1.1475106477737427, "kl": 0.21298089623451233, "learning_rate": 2.9657621289944322e-06, "loss": 0.0085, "reward": 1.05287504196167, "reward_std": 0.48967236280441284, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42787498235702515, "step": 2783 }, { "completion_length": 157.5, "epoch": 1.4895666131621188, "grad_norm": 8.127609252929688, "kl": 0.1917535960674286, "learning_rate": 2.964232812900074e-06, "loss": 0.0077, "reward": 1.3779687881469727, "reward_std": 0.8431963324546814, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42484375834465027, "step": 2784 }, { "completion_length": 127.28125, "epoch": 1.4901016586409845, "grad_norm": 1.7627164125442505, "kl": 0.22478283941745758, "learning_rate": 2.9627033168601825e-06, "loss": 0.009, "reward": 1.6999374628067017, "reward_std": 0.648408055305481, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46556252241134644, "step": 2785 }, { "completion_length": 122.65625, "epoch": 1.4906367041198503, "grad_norm": 1.6123881340026855, "kl": 0.22207456827163696, "learning_rate": 2.961173641467619e-06, "loss": 0.0089, "reward": 1.820812463760376, "reward_std": 0.6893752217292786, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46143752336502075, "step": 2786 }, { "completion_length": 127.40625, "epoch": 1.491171749598716, "grad_norm": 0.7556410431861877, "kl": 0.23349341750144958, "learning_rate": 2.959643787315314e-06, "loss": 0.0093, "reward": 2.200124979019165, "reward_std": 0.713494598865509, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4657500088214874, "step": 2787 }, { "completion_length": 143.90625, "epoch": 1.4917067950775815, "grad_norm": 0.958646297454834, "kl": 0.16964948177337646, "learning_rate": 2.95811375499627e-06, "loss": 0.0068, "reward": 2.1897811889648438, "reward_std": 1.03711998462677, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4241562485694885, "step": 2788 }, { "completion_length": 138.53125, "epoch": 1.4922418405564473, "grad_norm": 592824.875, "kl": 13720.732421875, "learning_rate": 2.9565835451035546e-06, "loss": 548.8293, "reward": 2.234375, "reward_std": 0.9457334280014038, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2789 }, { "completion_length": 111.625, "epoch": 1.492776886035313, "grad_norm": 4988346.5, "kl": 478687.1875, "learning_rate": 2.9550531582303083e-06, "loss": 19147.4863, "reward": 2.53515625, "reward_std": 0.6518573760986328, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 2790 }, { "completion_length": 113.84375, "epoch": 1.4933119315141787, "grad_norm": 2.980736494064331, "kl": 0.2548607289791107, "learning_rate": 2.953522594969738e-06, "loss": 0.0102, "reward": 1.538406252861023, "reward_std": 0.8247779607772827, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46028125286102295, "step": 2791 }, { "completion_length": 129.59375, "epoch": 1.4938469769930445, "grad_norm": 1.9021514654159546, "kl": 0.28759902715682983, "learning_rate": 2.9519918559151196e-06, "loss": 0.0115, "reward": 2.5008437633514404, "reward_std": 0.4627074599266052, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46959376335144043, "step": 2792 }, { "completion_length": 112.625, "epoch": 1.49438202247191, "grad_norm": 1.3428171873092651, "kl": 0.22313812375068665, "learning_rate": 2.9504609416597955e-06, "loss": 0.0089, "reward": 2.534343719482422, "reward_std": 1.0862761735916138, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48746874928474426, "step": 2793 }, { "completion_length": 138.3125, "epoch": 1.4949170679507757, "grad_norm": 2.039231300354004, "kl": 0.1519438624382019, "learning_rate": 2.9489298527971793e-06, "loss": 0.0061, "reward": 1.6340937614440918, "reward_std": 0.8638986349105835, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4465937614440918, "step": 2794 }, { "completion_length": 115.125, "epoch": 1.4954521134296415, "grad_norm": 1.3030146360397339, "kl": 0.23495835065841675, "learning_rate": 2.947398589920749e-06, "loss": 0.0094, "reward": 2.5662498474121094, "reward_std": 0.7566441893577576, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4725000262260437, "step": 2795 }, { "completion_length": 123.84375, "epoch": 1.4959871589085072, "grad_norm": 1.3847332000732422, "kl": 0.20635679364204407, "learning_rate": 2.9458671536240526e-06, "loss": 0.0083, "reward": 2.2629687786102295, "reward_std": 0.7533128261566162, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4660937488079071, "step": 2796 }, { "completion_length": 123.0625, "epoch": 1.496522204387373, "grad_norm": 1.4203479290008545, "kl": 0.17579275369644165, "learning_rate": 2.9443355445007033e-06, "loss": 0.007, "reward": 1.9950000047683716, "reward_std": 0.740540087223053, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4793750047683716, "step": 2797 }, { "completion_length": 138.125, "epoch": 1.4970572498662387, "grad_norm": 2.3732383251190186, "kl": 0.22859255969524384, "learning_rate": 2.9428037631443812e-06, "loss": 0.0091, "reward": 1.886968731880188, "reward_std": 0.9358064532279968, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.465093731880188, "step": 2798 }, { "completion_length": 149.34375, "epoch": 1.4975922953451044, "grad_norm": 0.9683041572570801, "kl": 0.12454868853092194, "learning_rate": 2.9412718101488365e-06, "loss": 0.005, "reward": 1.696874976158142, "reward_std": 0.7742636799812317, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4156250059604645, "step": 2799 }, { "completion_length": 111.46875, "epoch": 1.4981273408239701, "grad_norm": 1.0645791292190552, "kl": 0.21737028658390045, "learning_rate": 2.939739686107882e-06, "loss": 0.0087, "reward": 2.3020312786102295, "reward_std": 0.9942097067832947, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4739062488079071, "step": 2800 }, { "completion_length": 129.21875, "epoch": 1.4986623863028357, "grad_norm": 0.8978448510169983, "kl": 0.2582496702671051, "learning_rate": 2.938207391615398e-06, "loss": 0.0103, "reward": 2.6440000534057617, "reward_std": 0.7669132947921753, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45649999380111694, "step": 2801 }, { "completion_length": 136.71875, "epoch": 1.4991974317817014, "grad_norm": 0.42149198055267334, "kl": 0.117653027176857, "learning_rate": 2.9366749272653326e-06, "loss": 0.0047, "reward": 1.85337495803833, "reward_std": 0.6863834857940674, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46275001764297485, "step": 2802 }, { "completion_length": 149.65625, "epoch": 1.4997324772605671, "grad_norm": 1.194693684577942, "kl": 0.21295571327209473, "learning_rate": 2.935142293651697e-06, "loss": 0.0085, "reward": 1.4307812452316284, "reward_std": 0.7101374864578247, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3995312452316284, "step": 2803 }, { "completion_length": 136.34375, "epoch": 1.5002675227394329, "grad_norm": 2.078660249710083, "kl": 0.1588565558195114, "learning_rate": 2.93360949136857e-06, "loss": 0.0064, "reward": 2.010531187057495, "reward_std": 0.8602072596549988, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4011562466621399, "step": 2804 }, { "completion_length": 142.25, "epoch": 1.5008025682182986, "grad_norm": 0.7283381223678589, "kl": 0.21156853437423706, "learning_rate": 2.9320765210100955e-06, "loss": 0.0085, "reward": 2.390625, "reward_std": 1.005197525024414, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.484375, "step": 2805 }, { "completion_length": 128.90625, "epoch": 1.5013376136971641, "grad_norm": 3.1048946380615234, "kl": 0.25276023149490356, "learning_rate": 2.930543383170481e-06, "loss": 0.0101, "reward": 1.401750087738037, "reward_std": 0.8020657300949097, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40174999833106995, "step": 2806 }, { "completion_length": 138.9375, "epoch": 1.5018726591760299, "grad_norm": 3597.33740234375, "kl": 2.2179908752441406, "learning_rate": 2.9290100784440017e-06, "loss": 0.0887, "reward": 2.174875020980835, "reward_std": 0.7766355276107788, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4561249911785126, "step": 2807 }, { "completion_length": 123.03125, "epoch": 1.5024077046548956, "grad_norm": 1.5759179592132568, "kl": 0.19319914281368256, "learning_rate": 2.9274766074249944e-06, "loss": 0.0077, "reward": 2.0750937461853027, "reward_std": 1.1392741203308105, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45009374618530273, "step": 2808 }, { "completion_length": 128.25, "epoch": 1.5029427501337613, "grad_norm": 2.5097339153289795, "kl": 0.4891098141670227, "learning_rate": 2.9259429707078625e-06, "loss": 0.0196, "reward": 1.8146562576293945, "reward_std": 0.9163030982017517, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45528125762939453, "step": 2809 }, { "completion_length": 119.3125, "epoch": 1.503477795612627, "grad_norm": 22.333335876464844, "kl": 1.0124517679214478, "learning_rate": 2.9244091688870736e-06, "loss": 0.0405, "reward": 2.398312568664551, "reward_std": 1.1247996091842651, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.460812509059906, "step": 2810 }, { "completion_length": 132.8125, "epoch": 1.5040128410914928, "grad_norm": 1.0720272064208984, "kl": 0.2392028570175171, "learning_rate": 2.9228752025571576e-06, "loss": 0.0096, "reward": 1.630968689918518, "reward_std": 0.7280064821243286, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45909374952316284, "step": 2811 }, { "completion_length": 149.75, "epoch": 1.5045478865703585, "grad_norm": 3.016479015350342, "kl": 0.16659823060035706, "learning_rate": 2.9213410723127095e-06, "loss": 0.0067, "reward": 1.1156562566757202, "reward_std": 0.5876269936561584, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3656562566757202, "step": 2812 }, { "completion_length": 145.625, "epoch": 1.5050829320492243, "grad_norm": 0.4489293694496155, "kl": 0.1308424174785614, "learning_rate": 2.919806778748388e-06, "loss": 0.0052, "reward": 1.5255625247955322, "reward_std": 0.5097245573997498, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46306249499320984, "step": 2813 }, { "completion_length": 138.59375, "epoch": 1.50561797752809, "grad_norm": 1.7975332736968994, "kl": 0.19429805874824524, "learning_rate": 2.9182723224589142e-06, "loss": 0.0078, "reward": 1.6054061651229858, "reward_std": 0.8763082027435303, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.433531254529953, "step": 2814 }, { "completion_length": 144.0625, "epoch": 1.5061530230069557, "grad_norm": 1.877580165863037, "kl": 0.2097052037715912, "learning_rate": 2.916737704039073e-06, "loss": 0.0084, "reward": 1.5800312757492065, "reward_std": 0.8248507976531982, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43940627574920654, "step": 2815 }, { "completion_length": 126.40625, "epoch": 1.5066880684858213, "grad_norm": 1.958600640296936, "kl": 0.18624496459960938, "learning_rate": 2.915202924083713e-06, "loss": 0.0074, "reward": 2.7591562271118164, "reward_std": 0.8632352352142334, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4779062569141388, "step": 2816 }, { "completion_length": 138.21875, "epoch": 1.507223113964687, "grad_norm": 0.7823576927185059, "kl": 0.1880708634853363, "learning_rate": 2.9136679831877425e-06, "loss": 0.0075, "reward": 2.167562484741211, "reward_std": 0.4279491901397705, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4488125145435333, "step": 2817 }, { "completion_length": 156.3125, "epoch": 1.5077581594435527, "grad_norm": 3.360062599182129, "kl": 0.13906031847000122, "learning_rate": 2.9121328819461343e-06, "loss": 0.0056, "reward": 1.8041250705718994, "reward_std": 0.9954816102981567, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.39787501096725464, "step": 2818 }, { "completion_length": 111.90625, "epoch": 1.5082932049224183, "grad_norm": 0.986932635307312, "kl": 0.1832042634487152, "learning_rate": 2.910597620953924e-06, "loss": 0.0073, "reward": 2.6426875591278076, "reward_std": 0.787361741065979, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48643749952316284, "step": 2819 }, { "completion_length": 126.6875, "epoch": 1.508828250401284, "grad_norm": 0.8110431432723999, "kl": 0.22448556125164032, "learning_rate": 2.909062200806208e-06, "loss": 0.009, "reward": 1.6897813081741333, "reward_std": 0.5702298879623413, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4866562485694885, "step": 2820 }, { "completion_length": 139.84375, "epoch": 1.5093632958801497, "grad_norm": 1.1691408157348633, "kl": 0.2705492377281189, "learning_rate": 2.9075266220981435e-06, "loss": 0.0108, "reward": 1.5701875686645508, "reward_std": 0.7216413021087646, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.445187509059906, "step": 2821 }, { "completion_length": 151.75, "epoch": 1.5098983413590155, "grad_norm": 0.5798828601837158, "kl": 0.15738169848918915, "learning_rate": 2.9059908854249514e-06, "loss": 0.0063, "reward": 1.3632500171661377, "reward_std": 0.5993263721466064, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4257500171661377, "step": 2822 }, { "completion_length": 136.09375, "epoch": 1.5104333868378812, "grad_norm": 0.9279030561447144, "kl": 0.1802811473608017, "learning_rate": 2.9044549913819125e-06, "loss": 0.0072, "reward": 1.6724061965942383, "reward_std": 0.8575655221939087, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43803125619888306, "step": 2823 }, { "completion_length": 127.9375, "epoch": 1.510968432316747, "grad_norm": 1.5261056423187256, "kl": 0.2822427451610565, "learning_rate": 2.9029189405643678e-06, "loss": 0.0113, "reward": 2.2003750801086426, "reward_std": 0.9731872081756592, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4816249907016754, "step": 2824 }, { "completion_length": 126.21875, "epoch": 1.5115034777956127, "grad_norm": 0.5208948850631714, "kl": 0.1785099357366562, "learning_rate": 2.9013827335677215e-06, "loss": 0.0071, "reward": 2.1640625, "reward_std": 0.8100846409797668, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4921875, "step": 2825 }, { "completion_length": 146.625, "epoch": 1.5120385232744784, "grad_norm": 0.5938888192176819, "kl": 0.17195087671279907, "learning_rate": 2.899846370987436e-06, "loss": 0.0069, "reward": 2.0957813262939453, "reward_std": 0.853634238243103, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45515623688697815, "step": 2826 }, { "completion_length": 118.46875, "epoch": 1.5125735687533441, "grad_norm": 0.7716425657272339, "kl": 0.21205450594425201, "learning_rate": 2.8983098534190347e-06, "loss": 0.0085, "reward": 2.2435624599456787, "reward_std": 0.8304901719093323, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4935624897480011, "step": 2827 }, { "completion_length": 130.59375, "epoch": 1.5131086142322099, "grad_norm": 0.9580174684524536, "kl": 0.1923292577266693, "learning_rate": 2.8967731814581016e-06, "loss": 0.0077, "reward": 2.2896249294281006, "reward_std": 0.564346432685852, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47712498903274536, "step": 2828 }, { "completion_length": 154.1875, "epoch": 1.5136436597110754, "grad_norm": 26.052284240722656, "kl": 1.1058355569839478, "learning_rate": 2.89523635570028e-06, "loss": 0.0442, "reward": 1.876562476158142, "reward_std": 1.2796788215637207, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4078125059604645, "step": 2829 }, { "completion_length": 127.125, "epoch": 1.5141787051899411, "grad_norm": 0.8850816488265991, "kl": 0.1919088363647461, "learning_rate": 2.8936993767412743e-06, "loss": 0.0077, "reward": 2.6147499084472656, "reward_std": 0.9652235507965088, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4897499978542328, "step": 2830 }, { "completion_length": 112.15625, "epoch": 1.5147137506688069, "grad_norm": 0.8642232418060303, "kl": 0.2588742971420288, "learning_rate": 2.8921622451768446e-06, "loss": 0.0104, "reward": 2.214531183242798, "reward_std": 1.0147173404693604, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4801562428474426, "step": 2831 }, { "completion_length": 135.96875, "epoch": 1.5152487961476726, "grad_norm": 0.9095354080200195, "kl": 0.210424542427063, "learning_rate": 2.8906249616028136e-06, "loss": 0.0084, "reward": 1.921875, "reward_std": 0.9029046297073364, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.453125, "step": 2832 }, { "completion_length": 122.75, "epoch": 1.5157838416265381, "grad_norm": 1.015858769416809, "kl": 0.23538681864738464, "learning_rate": 2.889087526615063e-06, "loss": 0.0094, "reward": 2.189406394958496, "reward_std": 0.6996535062789917, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47065624594688416, "step": 2833 }, { "completion_length": 140.5625, "epoch": 1.5163188871054039, "grad_norm": 0.5670363306999207, "kl": 0.1460399031639099, "learning_rate": 2.8875499408095316e-06, "loss": 0.0058, "reward": 2.0290000438690186, "reward_std": 0.5175659656524658, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4821249842643738, "step": 2834 }, { "completion_length": 113.53125, "epoch": 1.5168539325842696, "grad_norm": 5.688172817230225, "kl": 0.44500458240509033, "learning_rate": 2.8860122047822152e-06, "loss": 0.0178, "reward": 2.118562698364258, "reward_std": 0.9862138628959656, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4466875195503235, "step": 2835 }, { "completion_length": 124.5, "epoch": 1.5173889780631353, "grad_norm": 2012.082275390625, "kl": 48.993072509765625, "learning_rate": 2.884474319129172e-06, "loss": 1.9597, "reward": 1.866499900817871, "reward_std": 0.7302798628807068, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46024999022483826, "step": 2836 }, { "completion_length": 122.65625, "epoch": 1.517924023542001, "grad_norm": 1.7561320066452026, "kl": 0.2054012566804886, "learning_rate": 2.8829362844465154e-06, "loss": 0.0082, "reward": 2.380031108856201, "reward_std": 0.9412440061569214, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4425312280654907, "step": 2837 }, { "completion_length": 112.28125, "epoch": 1.5184590690208668, "grad_norm": 2.451258420944214, "kl": 0.21100592613220215, "learning_rate": 2.8813981013304164e-06, "loss": 0.0084, "reward": 2.1714999675750732, "reward_std": 0.9850468635559082, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46837499737739563, "step": 2838 }, { "completion_length": 134.0625, "epoch": 1.5189941144997325, "grad_norm": 36749.49609375, "kl": 9246.1533203125, "learning_rate": 2.879859770377105e-06, "loss": 369.8462, "reward": 1.264312505722046, "reward_std": 0.4699368476867676, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4830625057220459, "step": 2839 }, { "completion_length": 131.34375, "epoch": 1.5195291599785983, "grad_norm": 0.932751476764679, "kl": 0.1699632704257965, "learning_rate": 2.8783212921828674e-06, "loss": 0.0068, "reward": 2.216437578201294, "reward_std": 1.084227442741394, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4820624887943268, "step": 2840 }, { "completion_length": 125.21875, "epoch": 1.520064205457464, "grad_norm": 2.858569860458374, "kl": 0.38199737668037415, "learning_rate": 2.876782667344047e-06, "loss": 0.0153, "reward": 2.276625156402588, "reward_std": 1.0175037384033203, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49537500739097595, "step": 2841 }, { "completion_length": 122.875, "epoch": 1.5205992509363297, "grad_norm": 1.9433835744857788, "kl": 0.2342970073223114, "learning_rate": 2.8752438964570454e-06, "loss": 0.0094, "reward": 2.1699061393737793, "reward_std": 0.5585759878158569, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48240625858306885, "step": 2842 }, { "completion_length": 154.71875, "epoch": 1.5211342964151953, "grad_norm": 0.7037032842636108, "kl": 0.17482304573059082, "learning_rate": 2.8737049801183186e-06, "loss": 0.007, "reward": 1.76953125, "reward_std": 0.8338561058044434, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45703125, "step": 2843 }, { "completion_length": 122.875, "epoch": 1.521669341894061, "grad_norm": 1.819383978843689, "kl": 0.20456743240356445, "learning_rate": 2.8721659189243802e-06, "loss": 0.0082, "reward": 2.023937463760376, "reward_std": 0.9091111421585083, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47706252336502075, "step": 2844 }, { "completion_length": 141.875, "epoch": 1.5222043873729267, "grad_norm": 2.0318944454193115, "kl": 0.14704447984695435, "learning_rate": 2.8706267134718e-06, "loss": 0.0059, "reward": 2.1962499618530273, "reward_std": 1.0448691844940186, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4618750214576721, "step": 2845 }, { "completion_length": 142.25, "epoch": 1.5227394328517923, "grad_norm": 1.3617842197418213, "kl": 0.3222222626209259, "learning_rate": 2.8690873643572036e-06, "loss": 0.0129, "reward": 1.8246874809265137, "reward_std": 0.8354192972183228, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.40281251072883606, "step": 2846 }, { "completion_length": 112.71875, "epoch": 1.523274478330658, "grad_norm": 1.4345216751098633, "kl": 0.16801851987838745, "learning_rate": 2.8675478721772714e-06, "loss": 0.0067, "reward": 3.1338748931884766, "reward_std": 0.8404861688613892, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4620000123977661, "step": 2847 }, { "completion_length": 159.625, "epoch": 1.5238095238095237, "grad_norm": 0.5702925324440002, "kl": 0.10928720235824585, "learning_rate": 2.866008237528742e-06, "loss": 0.0044, "reward": 1.2414062023162842, "reward_std": 0.7909468412399292, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.36640626192092896, "step": 2848 }, { "completion_length": 117.3125, "epoch": 1.5243445692883895, "grad_norm": 1.6615955829620361, "kl": 0.30235379934310913, "learning_rate": 2.8644684610084052e-06, "loss": 0.0121, "reward": 2.1357500553131104, "reward_std": 0.5884919762611389, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46387502551078796, "step": 2849 }, { "completion_length": 116.28125, "epoch": 1.5248796147672552, "grad_norm": 1.1682368516921997, "kl": 0.2559334933757782, "learning_rate": 2.862928543213108e-06, "loss": 0.0102, "reward": 2.7808749675750732, "reward_std": 0.9738439321517944, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49962499737739563, "step": 2850 }, { "completion_length": 135.96875, "epoch": 1.525414660246121, "grad_norm": 18.185802459716797, "kl": 0.2559334337711334, "learning_rate": 2.8613884847397545e-06, "loss": 0.0102, "reward": 2.711343765258789, "reward_std": 0.5722464323043823, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49259376525878906, "step": 2851 }, { "completion_length": 121.15625, "epoch": 1.5259497057249867, "grad_norm": 1.9867019653320312, "kl": 0.16541996598243713, "learning_rate": 2.859848286185298e-06, "loss": 0.0066, "reward": 2.2968125343322754, "reward_std": 0.5982604026794434, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 2852 }, { "completion_length": 106.0625, "epoch": 1.5264847512038524, "grad_norm": 1.0073187351226807, "kl": 0.19459345936775208, "learning_rate": 2.858307948146751e-06, "loss": 0.0078, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 2853 }, { "completion_length": 115.5, "epoch": 1.5270197966827181, "grad_norm": 1.2429803609848022, "kl": 0.2610155940055847, "learning_rate": 2.856767471221177e-06, "loss": 0.0104, "reward": 2.609375, "reward_std": 0.6756467223167419, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46875, "step": 2854 }, { "completion_length": 112.5625, "epoch": 1.5275548421615839, "grad_norm": 1.2875025272369385, "kl": 0.2635664939880371, "learning_rate": 2.8552268560056936e-06, "loss": 0.0105, "reward": 2.4838123321533203, "reward_std": 0.8594045639038086, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48381251096725464, "step": 2855 }, { "completion_length": 103.4375, "epoch": 1.5280898876404494, "grad_norm": 2.761746883392334, "kl": 0.47831475734710693, "learning_rate": 2.8536861030974744e-06, "loss": 0.0191, "reward": 1.9387186765670776, "reward_std": 0.4950001537799835, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4699687361717224, "step": 2856 }, { "completion_length": 124.0, "epoch": 1.5286249331193151, "grad_norm": 2.5366203784942627, "kl": 0.17868222296237946, "learning_rate": 2.8521452130937443e-06, "loss": 0.0071, "reward": 2.100749969482422, "reward_std": 0.6663400530815125, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46012499928474426, "step": 2857 }, { "completion_length": 137.125, "epoch": 1.5291599785981809, "grad_norm": 1.1010187864303589, "kl": 0.17411601543426514, "learning_rate": 2.8506041865917813e-06, "loss": 0.007, "reward": 1.261062502861023, "reward_std": 0.5422983169555664, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.41731250286102295, "step": 2858 }, { "completion_length": 143.96875, "epoch": 1.5296950240770464, "grad_norm": 23.324357986450195, "kl": 0.57642662525177, "learning_rate": 2.8490630241889172e-06, "loss": 0.0231, "reward": 1.8530313968658447, "reward_std": 0.8309878706932068, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4467812478542328, "step": 2859 }, { "completion_length": 106.03125, "epoch": 1.5302300695559121, "grad_norm": 1.3605897426605225, "kl": 0.18237635493278503, "learning_rate": 2.8475217264825357e-06, "loss": 0.0073, "reward": 1.8448437452316284, "reward_std": 0.3971787691116333, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4854687452316284, "step": 2860 }, { "completion_length": 132.21875, "epoch": 1.5307651150347779, "grad_norm": 2.2857308387756348, "kl": 0.19817303121089935, "learning_rate": 2.8459802940700736e-06, "loss": 0.0079, "reward": 2.4452500343322754, "reward_std": 0.7799544334411621, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.492125004529953, "step": 2861 }, { "completion_length": 159.625, "epoch": 1.5313001605136436, "grad_norm": 0.914454996585846, "kl": 0.14342030882835388, "learning_rate": 2.84443872754902e-06, "loss": 0.0057, "reward": 1.8664062023162842, "reward_std": 0.7756016850471497, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44453126192092896, "step": 2862 }, { "completion_length": 136.4375, "epoch": 1.5318352059925093, "grad_norm": 0.6905821561813354, "kl": 0.14889606833457947, "learning_rate": 2.842897027516915e-06, "loss": 0.006, "reward": 1.9609375, "reward_std": 1.012934923171997, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 2863 }, { "completion_length": 145.09375, "epoch": 1.532370251471375, "grad_norm": 26.54477882385254, "kl": 6.447727680206299, "learning_rate": 2.841355194571351e-06, "loss": 0.2579, "reward": 1.9599686861038208, "reward_std": 1.2847235202789307, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4130937457084656, "step": 2864 }, { "completion_length": 124.4375, "epoch": 1.5329052969502408, "grad_norm": 1.4063081741333008, "kl": 0.16029764711856842, "learning_rate": 2.8398132293099733e-06, "loss": 0.0064, "reward": 2.375, "reward_std": 0.5455899834632874, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 2865 }, { "completion_length": 128.46875, "epoch": 1.5334403424291065, "grad_norm": 0.9098549485206604, "kl": 0.1766059547662735, "learning_rate": 2.8382711323304753e-06, "loss": 0.0071, "reward": 1.7989063262939453, "reward_std": 0.792506217956543, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47078123688697815, "step": 2866 }, { "completion_length": 141.84375, "epoch": 1.5339753879079723, "grad_norm": 1.7440128326416016, "kl": 0.22029483318328857, "learning_rate": 2.8367289042306044e-06, "loss": 0.0088, "reward": 2.0234375, "reward_std": 0.8651784658432007, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 2867 }, { "completion_length": 126.65625, "epoch": 1.534510433386838, "grad_norm": 0.7980361580848694, "kl": 0.14600861072540283, "learning_rate": 2.8351865456081584e-06, "loss": 0.0058, "reward": 2.0314998626708984, "reward_std": 0.8855797052383423, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4690000116825104, "step": 2868 }, { "completion_length": 125.4375, "epoch": 1.5350454788657037, "grad_norm": 0.8187362551689148, "kl": 0.16308116912841797, "learning_rate": 2.8336440570609837e-06, "loss": 0.0065, "reward": 2.0773749351501465, "reward_std": 0.5422660708427429, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49924999475479126, "step": 2869 }, { "completion_length": 148.71875, "epoch": 1.5355805243445693, "grad_norm": 0.5246856212615967, "kl": 0.10569380968809128, "learning_rate": 2.8321014391869785e-06, "loss": 0.0042, "reward": 2.198812484741211, "reward_std": 0.8249722123146057, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46443748474121094, "step": 2870 }, { "completion_length": 104.4375, "epoch": 1.536115569823435, "grad_norm": 2.2963366508483887, "kl": 0.2766697406768799, "learning_rate": 2.830558692584092e-06, "loss": 0.0111, "reward": 2.4568750858306885, "reward_std": 0.26439139246940613, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4725000262260437, "step": 2871 }, { "completion_length": 120.25, "epoch": 1.5366506153023007, "grad_norm": 1.877935528755188, "kl": 0.22896865010261536, "learning_rate": 2.829015817850322e-06, "loss": 0.0092, "reward": 1.796875, "reward_std": 0.7854158282279968, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.453125, "step": 2872 }, { "completion_length": 119.53125, "epoch": 1.5371856607811663, "grad_norm": 2.937316656112671, "kl": 0.24668942391872406, "learning_rate": 2.827472815583717e-06, "loss": 0.0099, "reward": 2.749875068664551, "reward_std": 0.9827315807342529, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 2873 }, { "completion_length": 147.125, "epoch": 1.537720706260032, "grad_norm": 864.3816528320312, "kl": 4.23674201965332, "learning_rate": 2.8259296863823717e-06, "loss": 0.1695, "reward": 1.4458436965942383, "reward_std": 0.8753246665000916, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.43021875619888306, "step": 2874 }, { "completion_length": 126.5, "epoch": 1.5382557517388977, "grad_norm": 3.0275957584381104, "kl": 0.28963226079940796, "learning_rate": 2.8243864308444352e-06, "loss": 0.0116, "reward": 2.083156108856201, "reward_std": 0.9231857061386108, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4269062280654907, "step": 2875 }, { "completion_length": 153.34375, "epoch": 1.5387907972177635, "grad_norm": 27.566633224487305, "kl": 5.911110877990723, "learning_rate": 2.822843049568102e-06, "loss": 0.2364, "reward": 1.3273749351501465, "reward_std": 0.5256280303001404, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43674999475479126, "step": 2876 }, { "completion_length": 126.8125, "epoch": 1.5393258426966292, "grad_norm": 0.7774311304092407, "kl": 0.19551068544387817, "learning_rate": 2.8212995431516165e-06, "loss": 0.0078, "reward": 2.330531120300293, "reward_std": 0.6913439035415649, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4711562395095825, "step": 2877 }, { "completion_length": 125.28125, "epoch": 1.539860888175495, "grad_norm": 0.7495091557502747, "kl": 0.15821701288223267, "learning_rate": 2.8197559121932703e-06, "loss": 0.0063, "reward": 2.1250624656677246, "reward_std": 0.8158438205718994, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4375625252723694, "step": 2878 }, { "completion_length": 135.53125, "epoch": 1.5403959336543607, "grad_norm": 4.431620121002197, "kl": 0.2230129987001419, "learning_rate": 2.8182121572914055e-06, "loss": 0.0089, "reward": 2.3081562519073486, "reward_std": 1.0761901140213013, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48003125190734863, "step": 2879 }, { "completion_length": 144.53125, "epoch": 1.5409309791332264, "grad_norm": 0.6290665864944458, "kl": 0.12122111022472382, "learning_rate": 2.8166682790444115e-06, "loss": 0.0048, "reward": 1.4313124418258667, "reward_std": 0.849581241607666, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4000625014305115, "step": 2880 }, { "completion_length": 137.09375, "epoch": 1.5414660246120921, "grad_norm": 0.3268119990825653, "kl": 0.10898218303918839, "learning_rate": 2.815124278050724e-06, "loss": 0.0044, "reward": 2.447218894958496, "reward_std": 0.5455365777015686, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.41596877574920654, "step": 2881 }, { "completion_length": 96.5625, "epoch": 1.5420010700909579, "grad_norm": 1.2592731714248657, "kl": 0.19874218106269836, "learning_rate": 2.8135801549088277e-06, "loss": 0.0079, "reward": 2.1227188110351562, "reward_std": 0.5324397683143616, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4508437514305115, "step": 2882 }, { "completion_length": 131.09375, "epoch": 1.5425361155698234, "grad_norm": 1.5320613384246826, "kl": 0.24912410974502563, "learning_rate": 2.812035910217254e-06, "loss": 0.01, "reward": 2.4277501106262207, "reward_std": 0.9187932014465332, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47462499141693115, "step": 2883 }, { "completion_length": 133.90625, "epoch": 1.5430711610486891, "grad_norm": 0.620418131351471, "kl": 0.1385890245437622, "learning_rate": 2.8104915445745817e-06, "loss": 0.0055, "reward": 2.076624870300293, "reward_std": 0.6698155999183655, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4203749895095825, "step": 2884 }, { "completion_length": 131.0, "epoch": 1.5436062065275549, "grad_norm": 1.1505725383758545, "kl": 0.146537184715271, "learning_rate": 2.808947058579438e-06, "loss": 0.0059, "reward": 2.1791563034057617, "reward_std": 0.7068984508514404, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49165624380111694, "step": 2885 }, { "completion_length": 127.34375, "epoch": 1.5441412520064204, "grad_norm": 1.6797606945037842, "kl": 0.23101231455802917, "learning_rate": 2.8074024528304934e-06, "loss": 0.0092, "reward": 2.387406349182129, "reward_std": 0.8651675581932068, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46553125977516174, "step": 2886 }, { "completion_length": 125.28125, "epoch": 1.5446762974852861, "grad_norm": 146.22341918945312, "kl": 0.8786709904670715, "learning_rate": 2.8058577279264683e-06, "loss": 0.0351, "reward": 2.362593650817871, "reward_std": 0.9100881814956665, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45634374022483826, "step": 2887 }, { "completion_length": 133.8125, "epoch": 1.5452113429641519, "grad_norm": 1.462598443031311, "kl": 0.3219149112701416, "learning_rate": 2.8043128844661253e-06, "loss": 0.0129, "reward": 1.1498749256134033, "reward_std": 0.537643551826477, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4467499852180481, "step": 2888 }, { "completion_length": 132.8125, "epoch": 1.5457463884430176, "grad_norm": 3.934285879135132, "kl": 0.17898127436637878, "learning_rate": 2.8027679230482778e-06, "loss": 0.0072, "reward": 1.3976874351501465, "reward_std": 0.4386926591396332, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47581249475479126, "step": 2889 }, { "completion_length": 115.8125, "epoch": 1.5462814339218833, "grad_norm": 3.7709226608276367, "kl": 0.1694173365831375, "learning_rate": 2.801222844271782e-06, "loss": 0.0068, "reward": 2.46875, "reward_std": 0.5381704568862915, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2890 }, { "completion_length": 130.03125, "epoch": 1.546816479400749, "grad_norm": 1.3655989170074463, "kl": 0.17891526222229004, "learning_rate": 2.799677648735539e-06, "loss": 0.0072, "reward": 1.8215625286102295, "reward_std": 0.2794884741306305, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4621874988079071, "step": 2891 }, { "completion_length": 125.9375, "epoch": 1.5473515248796148, "grad_norm": 1.328069806098938, "kl": 0.18365885317325592, "learning_rate": 2.7981323370384966e-06, "loss": 0.0073, "reward": 2.5840001106262207, "reward_std": 0.7153013944625854, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47462499141693115, "step": 2892 }, { "completion_length": 137.96875, "epoch": 1.5478865703584805, "grad_norm": 3.124297618865967, "kl": 0.18062035739421844, "learning_rate": 2.796586909779647e-06, "loss": 0.0072, "reward": 1.8685312271118164, "reward_std": 0.7253150343894958, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4622812569141388, "step": 2893 }, { "completion_length": 145.5625, "epoch": 1.5484216158373463, "grad_norm": 0.8019036054611206, "kl": 0.17163896560668945, "learning_rate": 2.795041367558029e-06, "loss": 0.0069, "reward": 1.6609063148498535, "reward_std": 0.9359814524650574, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.39528122544288635, "step": 2894 }, { "completion_length": 149.65625, "epoch": 1.548956661316212, "grad_norm": 0.7363675236701965, "kl": 0.2016907036304474, "learning_rate": 2.7934957109727224e-06, "loss": 0.0081, "reward": 1.4960312843322754, "reward_std": 0.6939598321914673, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.464781254529953, "step": 2895 }, { "completion_length": 149.03125, "epoch": 1.5494917067950775, "grad_norm": 6.092795372009277, "kl": 0.17325285077095032, "learning_rate": 2.791949940622854e-06, "loss": 0.0069, "reward": 1.4209375381469727, "reward_std": 0.88936847448349, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.38968750834465027, "step": 2896 }, { "completion_length": 152.71875, "epoch": 1.5500267522739433, "grad_norm": 0.29906415939331055, "kl": 0.11440536379814148, "learning_rate": 2.790404057107594e-06, "loss": 0.0046, "reward": 1.6015937328338623, "reward_std": 0.9708333611488342, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4140937328338623, "step": 2897 }, { "completion_length": 117.875, "epoch": 1.550561797752809, "grad_norm": 1.7480674982070923, "kl": 0.3137432336807251, "learning_rate": 2.7888580610261567e-06, "loss": 0.0125, "reward": 2.11328125, "reward_std": 0.7829906344413757, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44140625, "step": 2898 }, { "completion_length": 124.96875, "epoch": 1.5510968432316747, "grad_norm": 4.8333916664123535, "kl": 0.29166191816329956, "learning_rate": 2.7873119529778e-06, "loss": 0.0117, "reward": 2.284843683242798, "reward_std": 0.5950005054473877, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4879687428474426, "step": 2899 }, { "completion_length": 143.1875, "epoch": 1.5516318887105403, "grad_norm": 0.6198129057884216, "kl": 0.14172612130641937, "learning_rate": 2.785765733561825e-06, "loss": 0.0057, "reward": 2.3957812786102295, "reward_std": 0.929802656173706, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4426562488079071, "step": 2900 }, { "completion_length": 138.6875, "epoch": 1.552166934189406, "grad_norm": 1.0525423288345337, "kl": 0.1839108169078827, "learning_rate": 2.784219403377575e-06, "loss": 0.0074, "reward": 2.03543758392334, "reward_std": 0.9301377534866333, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4416874647140503, "step": 2901 }, { "completion_length": 120.5625, "epoch": 1.5527019796682717, "grad_norm": 1.4437170028686523, "kl": 0.18287771940231323, "learning_rate": 2.7826729630244377e-06, "loss": 0.0073, "reward": 2.15625, "reward_std": 0.46389174461364746, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2902 }, { "completion_length": 129.75, "epoch": 1.5532370251471375, "grad_norm": 0.7846359610557556, "kl": 0.1599637269973755, "learning_rate": 2.7811264131018434e-06, "loss": 0.0064, "reward": 2.1429686546325684, "reward_std": 0.8482103943824768, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4867187440395355, "step": 2903 }, { "completion_length": 104.46875, "epoch": 1.5537720706260032, "grad_norm": 0.799728274345398, "kl": 0.19626358151435852, "learning_rate": 2.7795797542092646e-06, "loss": 0.0079, "reward": 2.8060312271118164, "reward_std": 0.40029123425483704, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4935312569141388, "step": 2904 }, { "completion_length": 142.96875, "epoch": 1.554307116104869, "grad_norm": 0.9170252680778503, "kl": 0.20173268020153046, "learning_rate": 2.7780329869462154e-06, "loss": 0.0081, "reward": 1.8688125610351562, "reward_std": 0.5954122543334961, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4781875014305115, "step": 2905 }, { "completion_length": 111.1875, "epoch": 1.5548421615837347, "grad_norm": 1539.5079345703125, "kl": 10.80002212524414, "learning_rate": 2.776486111912252e-06, "loss": 0.432, "reward": 2.65234375, "reward_std": 1.0237622261047363, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49609375, "step": 2906 }, { "completion_length": 153.03125, "epoch": 1.5553772070626004, "grad_norm": 0.7282060384750366, "kl": 0.152650386095047, "learning_rate": 2.7749391297069738e-06, "loss": 0.0061, "reward": 1.9284687042236328, "reward_std": 0.8513906002044678, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4284687638282776, "step": 2907 }, { "completion_length": 137.78125, "epoch": 1.5559122525414661, "grad_norm": 1.176946997642517, "kl": 0.1447724997997284, "learning_rate": 2.773392040930021e-06, "loss": 0.0058, "reward": 2.134406328201294, "reward_std": 1.0717136859893799, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46253126859664917, "step": 2908 }, { "completion_length": 118.59375, "epoch": 1.5564472980203319, "grad_norm": 1.2527220249176025, "kl": 0.17295733094215393, "learning_rate": 2.7718448461810733e-06, "loss": 0.0069, "reward": 1.8905625343322754, "reward_std": 0.5219457149505615, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 2909 }, { "completion_length": 140.5, "epoch": 1.5569823434991974, "grad_norm": 4.9245524406433105, "kl": 0.24536597728729248, "learning_rate": 2.7702975460598545e-06, "loss": 0.0098, "reward": 1.60546875, "reward_std": 0.6528253555297852, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48046875, "step": 2910 }, { "completion_length": 142.625, "epoch": 1.5575173889780631, "grad_norm": 1.6501611471176147, "kl": 0.13922777771949768, "learning_rate": 2.7687501411661277e-06, "loss": 0.0056, "reward": 1.723312497138977, "reward_std": 0.9427756071090698, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42643749713897705, "step": 2911 }, { "completion_length": 128.125, "epoch": 1.5580524344569289, "grad_norm": 0.8658198118209839, "kl": 0.183477520942688, "learning_rate": 2.767202632099695e-06, "loss": 0.0073, "reward": 2.1187500953674316, "reward_std": 0.9073437452316284, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4624999761581421, "step": 2912 }, { "completion_length": 130.0, "epoch": 1.5585874799357944, "grad_norm": 0.8627641797065735, "kl": 0.13174742460250854, "learning_rate": 2.7656550194604027e-06, "loss": 0.0053, "reward": 1.75390625, "reward_std": 0.5672817230224609, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 2913 }, { "completion_length": 129.5625, "epoch": 1.5591225254146601, "grad_norm": 1.5623524188995361, "kl": 0.19506990909576416, "learning_rate": 2.7641073038481348e-06, "loss": 0.0078, "reward": 1.85365629196167, "reward_std": 0.7332762479782104, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.44740623235702515, "step": 2914 }, { "completion_length": 145.125, "epoch": 1.5596575708935259, "grad_norm": 0.7498164176940918, "kl": 0.13928355276584625, "learning_rate": 2.7625594858628135e-06, "loss": 0.0056, "reward": 1.9409375190734863, "reward_std": 0.7477842569351196, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47218751907348633, "step": 2915 }, { "completion_length": 145.46875, "epoch": 1.5601926163723916, "grad_norm": 0.7652363181114197, "kl": 0.18868757784366608, "learning_rate": 2.7610115661044046e-06, "loss": 0.0075, "reward": 1.711437463760376, "reward_std": 0.6908295750617981, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46143749356269836, "step": 2916 }, { "completion_length": 137.1875, "epoch": 1.5607276618512573, "grad_norm": 1.6736540794372559, "kl": 0.19137559831142426, "learning_rate": 2.7594635451729114e-06, "loss": 0.0077, "reward": 2.0067811012268066, "reward_std": 0.925565242767334, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4755312502384186, "step": 2917 }, { "completion_length": 143.25, "epoch": 1.561262707330123, "grad_norm": 0.6150174736976624, "kl": 0.16523492336273193, "learning_rate": 2.757915423668376e-06, "loss": 0.0066, "reward": 1.9707812070846558, "reward_std": 0.7142000198364258, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45515623688697815, "step": 2918 }, { "completion_length": 134.78125, "epoch": 1.5617977528089888, "grad_norm": 1.0277915000915527, "kl": 0.21140854060649872, "learning_rate": 2.75636720219088e-06, "loss": 0.0085, "reward": 2.0923123359680176, "reward_std": 0.8906586170196533, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4516874849796295, "step": 2919 }, { "completion_length": 111.53125, "epoch": 1.5623327982878545, "grad_norm": 1.0775140523910522, "kl": 0.21046459674835205, "learning_rate": 2.7548188813405437e-06, "loss": 0.0084, "reward": 2.7368125915527344, "reward_std": 1.0178179740905762, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4868125021457672, "step": 2920 }, { "completion_length": 147.375, "epoch": 1.5628678437667203, "grad_norm": 1.1235840320587158, "kl": 0.15613031387329102, "learning_rate": 2.7532704617175253e-06, "loss": 0.0062, "reward": 2.06040620803833, "reward_std": 1.0492455959320068, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43540626764297485, "step": 2921 }, { "completion_length": 135.6875, "epoch": 1.563402889245586, "grad_norm": 1.28683602809906, "kl": 0.249400794506073, "learning_rate": 2.7517219439220237e-06, "loss": 0.01, "reward": 1.409999966621399, "reward_std": 0.3966575562953949, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4256249666213989, "step": 2922 }, { "completion_length": 136.71875, "epoch": 1.5639379347244515, "grad_norm": 1.1032741069793701, "kl": 0.17786775529384613, "learning_rate": 2.7501733285542727e-06, "loss": 0.0071, "reward": 2.278343677520752, "reward_std": 0.8950932025909424, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48146873712539673, "step": 2923 }, { "completion_length": 128.15625, "epoch": 1.5644729802033173, "grad_norm": 1.1840169429779053, "kl": 0.15800386667251587, "learning_rate": 2.748624616214546e-06, "loss": 0.0063, "reward": 1.7379062175750732, "reward_std": 0.5943506360054016, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.456656277179718, "step": 2924 }, { "completion_length": 124.28125, "epoch": 1.565008025682183, "grad_norm": 6.680614948272705, "kl": 0.4335630536079407, "learning_rate": 2.7470758075031536e-06, "loss": 0.0173, "reward": 2.010406255722046, "reward_std": 0.7055975198745728, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4635312557220459, "step": 2925 }, { "completion_length": 125.34375, "epoch": 1.5655430711610487, "grad_norm": 0.4522371292114258, "kl": 0.1581278145313263, "learning_rate": 2.745526903020443e-06, "loss": 0.0063, "reward": 2.1484375, "reward_std": 0.8004783987998962, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 2926 }, { "completion_length": 148.3125, "epoch": 1.5660781166399143, "grad_norm": 0.707946240901947, "kl": 0.159323051571846, "learning_rate": 2.7439779033667995e-06, "loss": 0.0064, "reward": 1.9487812519073486, "reward_std": 1.0109336376190186, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43315625190734863, "step": 2927 }, { "completion_length": 127.78125, "epoch": 1.56661316211878, "grad_norm": 1.9600614309310913, "kl": 0.4035106599330902, "learning_rate": 2.7424288091426467e-06, "loss": 0.0161, "reward": 2.4140625, "reward_std": 0.9468797445297241, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 2928 }, { "completion_length": 122.8125, "epoch": 1.5671482075976457, "grad_norm": 1.3879555463790894, "kl": 0.17441187798976898, "learning_rate": 2.7408796209484406e-06, "loss": 0.007, "reward": 1.8639687299728394, "reward_std": 0.5108626484870911, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48896875977516174, "step": 2929 }, { "completion_length": 147.5625, "epoch": 1.5676832530765115, "grad_norm": 1.3161532878875732, "kl": 0.14799238741397858, "learning_rate": 2.7393303393846785e-06, "loss": 0.0059, "reward": 1.9359376430511475, "reward_std": 1.3024849891662598, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4203124940395355, "step": 2930 }, { "completion_length": 148.875, "epoch": 1.5682182985553772, "grad_norm": 0.6080257892608643, "kl": 0.12511751055717468, "learning_rate": 2.73778096505189e-06, "loss": 0.005, "reward": 2.2416563034057617, "reward_std": 0.6455854773521423, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46040624380111694, "step": 2931 }, { "completion_length": 126.78125, "epoch": 1.568753344034243, "grad_norm": 3.096454381942749, "kl": 0.24869313836097717, "learning_rate": 2.7362314985506433e-06, "loss": 0.0099, "reward": 2.270218849182129, "reward_std": 1.2770802974700928, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45771875977516174, "step": 2932 }, { "completion_length": 155.0625, "epoch": 1.5692883895131087, "grad_norm": 2.5165231227874756, "kl": 0.20848652720451355, "learning_rate": 2.734681940481541e-06, "loss": 0.0083, "reward": 1.6673749685287476, "reward_std": 0.9019525051116943, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.41737499833106995, "step": 2933 }, { "completion_length": 136.84375, "epoch": 1.5698234349919744, "grad_norm": 0.9308695197105408, "kl": 0.18527670204639435, "learning_rate": 2.7331322914452218e-06, "loss": 0.0074, "reward": 1.8629374504089355, "reward_std": 1.0250320434570312, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4566875100135803, "step": 2934 }, { "completion_length": 112.4375, "epoch": 1.5703584804708401, "grad_norm": 0.6698063015937805, "kl": 0.22130954265594482, "learning_rate": 2.7315825520423576e-06, "loss": 0.0089, "reward": 2.70884370803833, "reward_std": 0.7880285978317261, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47446876764297485, "step": 2935 }, { "completion_length": 124.5625, "epoch": 1.5708935259497059, "grad_norm": 0.6752370595932007, "kl": 0.15552040934562683, "learning_rate": 2.7300327228736595e-06, "loss": 0.0062, "reward": 2.0950000286102295, "reward_std": 0.8553524017333984, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4699999988079071, "step": 2936 }, { "completion_length": 106.875, "epoch": 1.5714285714285714, "grad_norm": 4.283224105834961, "kl": 0.48158183693885803, "learning_rate": 2.7284828045398705e-06, "loss": 0.0193, "reward": 2.3260936737060547, "reward_std": 0.6912247538566589, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48234376311302185, "step": 2937 }, { "completion_length": 151.3125, "epoch": 1.5719636169074371, "grad_norm": 1.1525205373764038, "kl": 0.33492690324783325, "learning_rate": 2.7269327976417676e-06, "loss": 0.0134, "reward": 1.7831250429153442, "reward_std": 1.1860216856002808, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.37687498331069946, "step": 2938 }, { "completion_length": 149.75, "epoch": 1.5724986623863029, "grad_norm": 0.8942763209342957, "kl": 0.17319583892822266, "learning_rate": 2.725382702780164e-06, "loss": 0.0069, "reward": 2.1243436336517334, "reward_std": 0.6779580116271973, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.38996875286102295, "step": 2939 }, { "completion_length": 131.46875, "epoch": 1.5730337078651684, "grad_norm": 0.9050469398498535, "kl": 0.22328269481658936, "learning_rate": 2.723832520555905e-06, "loss": 0.0089, "reward": 2.390625, "reward_std": 0.9127999544143677, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.453125, "step": 2940 }, { "completion_length": 131.28125, "epoch": 1.5735687533440341, "grad_norm": 1.4408166408538818, "kl": 0.2744303047657013, "learning_rate": 2.722282251569872e-06, "loss": 0.011, "reward": 1.1198749542236328, "reward_std": 0.363211452960968, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4323750138282776, "step": 2941 }, { "completion_length": 123.9375, "epoch": 1.5741037988228999, "grad_norm": 3.0846149921417236, "kl": 0.2112511545419693, "learning_rate": 2.7207318964229796e-06, "loss": 0.0085, "reward": 2.3660311698913574, "reward_std": 0.757225751876831, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4285312294960022, "step": 2942 }, { "completion_length": 113.65625, "epoch": 1.5746388443017656, "grad_norm": 0.9210206866264343, "kl": 0.186343714594841, "learning_rate": 2.719181455716174e-06, "loss": 0.0075, "reward": 3.171875, "reward_std": 0.6115370392799377, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2943 }, { "completion_length": 123.65625, "epoch": 1.5751738897806313, "grad_norm": 1.0685604810714722, "kl": 0.2600732445716858, "learning_rate": 2.717630930050436e-06, "loss": 0.0104, "reward": 2.192124843597412, "reward_std": 0.8124563694000244, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44212502241134644, "step": 2944 }, { "completion_length": 118.15625, "epoch": 1.575708935259497, "grad_norm": 1.1219353675842285, "kl": 0.24071934819221497, "learning_rate": 2.716080320026779e-06, "loss": 0.0096, "reward": 2.35756254196167, "reward_std": 0.9540398120880127, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46693748235702515, "step": 2945 }, { "completion_length": 124.78125, "epoch": 1.5762439807383628, "grad_norm": 1.2032396793365479, "kl": 0.15625911951065063, "learning_rate": 2.714529626246249e-06, "loss": 0.0063, "reward": 2.4270312786102295, "reward_std": 0.3666943907737732, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4582812190055847, "step": 2946 }, { "completion_length": 116.8125, "epoch": 1.5767790262172285, "grad_norm": 0.497875839471817, "kl": 0.12698620557785034, "learning_rate": 2.712978849309926e-06, "loss": 0.0051, "reward": 2.2391562461853027, "reward_std": 0.6867412328720093, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47353124618530273, "step": 2947 }, { "completion_length": 110.3125, "epoch": 1.5773140716960943, "grad_norm": 1.402372121810913, "kl": 0.1810697615146637, "learning_rate": 2.711427989818919e-06, "loss": 0.0072, "reward": 3.0095937252044678, "reward_std": 0.765332043170929, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.47834375500679016, "step": 2948 }, { "completion_length": 124.0, "epoch": 1.57784911717496, "grad_norm": 0.9291056990623474, "kl": 0.24690046906471252, "learning_rate": 2.7098770483743715e-06, "loss": 0.0099, "reward": 1.742843747138977, "reward_std": 0.7956242561340332, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43034374713897705, "step": 2949 }, { "completion_length": 152.96875, "epoch": 1.5783841626538255, "grad_norm": 2.1571855545043945, "kl": 0.2022266834974289, "learning_rate": 2.7083260255774586e-06, "loss": 0.0081, "reward": 1.9415937662124634, "reward_std": 0.8255671262741089, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.3947187662124634, "step": 2950 }, { "completion_length": 134.84375, "epoch": 1.5789192081326913, "grad_norm": 0.875408947467804, "kl": 0.17043431103229523, "learning_rate": 2.706774922029386e-06, "loss": 0.0068, "reward": 2.067593812942505, "reward_std": 0.9504868388175964, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4894687533378601, "step": 2951 }, { "completion_length": 153.03125, "epoch": 1.579454253611557, "grad_norm": 1.3515597581863403, "kl": 0.15834935009479523, "learning_rate": 2.7052237383313914e-06, "loss": 0.0063, "reward": 1.3705312013626099, "reward_std": 0.6166348457336426, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.40178126096725464, "step": 2952 }, { "completion_length": 123.375, "epoch": 1.5799892990904227, "grad_norm": 0.4445866048336029, "kl": 0.14818796515464783, "learning_rate": 2.7036724750847434e-06, "loss": 0.0059, "reward": 1.4794687032699585, "reward_std": 0.5196923613548279, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43259376287460327, "step": 2953 }, { "completion_length": 139.21875, "epoch": 1.5805243445692883, "grad_norm": 2.8882718086242676, "kl": 0.18260526657104492, "learning_rate": 2.70212113289074e-06, "loss": 0.0073, "reward": 2.0064687728881836, "reward_std": 1.24293851852417, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4127187430858612, "step": 2954 }, { "completion_length": 127.84375, "epoch": 1.581059390048154, "grad_norm": 4.690415859222412, "kl": 0.21087399125099182, "learning_rate": 2.7005697123507134e-06, "loss": 0.0084, "reward": 2.6768126487731934, "reward_std": 0.8516774773597717, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4893124997615814, "step": 2955 }, { "completion_length": 146.28125, "epoch": 1.5815944355270197, "grad_norm": 2.631610155105591, "kl": 0.16684448719024658, "learning_rate": 2.6990182140660225e-06, "loss": 0.0067, "reward": 1.8468749523162842, "reward_std": 1.010529637336731, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.40937501192092896, "step": 2956 }, { "completion_length": 129.21875, "epoch": 1.5821294810058855, "grad_norm": 1.6482176780700684, "kl": 0.16540127992630005, "learning_rate": 2.697466638638059e-06, "loss": 0.0066, "reward": 1.933343768119812, "reward_std": 0.8401152491569519, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4645937383174896, "step": 2957 }, { "completion_length": 155.0, "epoch": 1.5826645264847512, "grad_norm": 0.8297978043556213, "kl": 0.136812224984169, "learning_rate": 2.695914986668241e-06, "loss": 0.0055, "reward": 2.08984375, "reward_std": 0.6296411752700806, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44921875, "step": 2958 }, { "completion_length": 152.8125, "epoch": 1.583199571963617, "grad_norm": 0.9456663131713867, "kl": 0.1771710216999054, "learning_rate": 2.6943632587580203e-06, "loss": 0.0071, "reward": 1.8182499408721924, "reward_std": 0.9025100469589233, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42762500047683716, "step": 2959 }, { "completion_length": 128.09375, "epoch": 1.5837346174424827, "grad_norm": 1.5120009183883667, "kl": 0.21065184473991394, "learning_rate": 2.692811455508877e-06, "loss": 0.0084, "reward": 1.7418124675750732, "reward_std": 0.4007384181022644, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47618749737739563, "step": 2960 }, { "completion_length": 120.375, "epoch": 1.5842696629213484, "grad_norm": 0.4498593211174011, "kl": 0.1673533320426941, "learning_rate": 2.6912595775223175e-06, "loss": 0.0067, "reward": 1.9737499952316284, "reward_std": 0.7229340076446533, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4737499952316284, "step": 2961 }, { "completion_length": 135.5625, "epoch": 1.5848047084002141, "grad_norm": 1.22760009765625, "kl": 0.16947519779205322, "learning_rate": 2.6897076253998816e-06, "loss": 0.0068, "reward": 1.3422499895095825, "reward_std": 0.41232067346572876, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4828749895095825, "step": 2962 }, { "completion_length": 149.53125, "epoch": 1.5853397538790799, "grad_norm": 1.1825590133666992, "kl": 0.16810624301433563, "learning_rate": 2.688155599743134e-06, "loss": 0.0067, "reward": 1.7567187547683716, "reward_std": 0.7000641226768494, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4598437547683716, "step": 2963 }, { "completion_length": 147.0, "epoch": 1.5858747993579454, "grad_norm": 1.035516381263733, "kl": 0.16001009941101074, "learning_rate": 2.6866035011536705e-06, "loss": 0.0064, "reward": 1.7449061870574951, "reward_std": 0.8829474449157715, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4480312466621399, "step": 2964 }, { "completion_length": 126.25, "epoch": 1.5864098448368111, "grad_norm": 0.5665059089660645, "kl": 0.16176649928092957, "learning_rate": 2.685051330233115e-06, "loss": 0.0065, "reward": 2.702531337738037, "reward_std": 0.5348912477493286, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46815624833106995, "step": 2965 }, { "completion_length": 120.6875, "epoch": 1.5869448903156769, "grad_norm": 0.4327903687953949, "kl": 0.14976799488067627, "learning_rate": 2.6834990875831174e-06, "loss": 0.006, "reward": 3.0390625, "reward_std": 0.6147754788398743, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4765625, "step": 2966 }, { "completion_length": 115.4375, "epoch": 1.5874799357945424, "grad_norm": 1.1694579124450684, "kl": 0.21993768215179443, "learning_rate": 2.6819467738053575e-06, "loss": 0.0088, "reward": 2.7275314331054688, "reward_std": 0.9555306434631348, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4775312542915344, "step": 2967 }, { "completion_length": 151.90625, "epoch": 1.5880149812734081, "grad_norm": 0.7933545708656311, "kl": 0.1269931197166443, "learning_rate": 2.680394389501541e-06, "loss": 0.0051, "reward": 1.9349687099456787, "reward_std": 0.9142060279846191, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4349687695503235, "step": 2968 }, { "completion_length": 157.15625, "epoch": 1.5885500267522739, "grad_norm": 0.9742212891578674, "kl": 0.18860280513763428, "learning_rate": 2.6788419352734013e-06, "loss": 0.0075, "reward": 1.3320937156677246, "reward_std": 0.8894503116607666, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3477187156677246, "step": 2969 }, { "completion_length": 117.28125, "epoch": 1.5890850722311396, "grad_norm": 0.6940496563911438, "kl": 0.21257057785987854, "learning_rate": 2.677289411722702e-06, "loss": 0.0085, "reward": 2.3844685554504395, "reward_std": 0.8078437447547913, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4782187342643738, "step": 2970 }, { "completion_length": 160.625, "epoch": 1.5896201177100053, "grad_norm": 1.2091124057769775, "kl": 0.20541879534721375, "learning_rate": 2.675736819451229e-06, "loss": 0.0082, "reward": 1.820718765258789, "reward_std": 1.0880634784698486, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3832187354564667, "step": 2971 }, { "completion_length": 140.9375, "epoch": 1.590155163188871, "grad_norm": 1.744682788848877, "kl": 0.321117103099823, "learning_rate": 2.674184159060797e-06, "loss": 0.0128, "reward": 1.4978437423706055, "reward_std": 0.851235032081604, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41971874237060547, "step": 2972 }, { "completion_length": 124.59375, "epoch": 1.5906902086677368, "grad_norm": 0.7143811583518982, "kl": 0.18699871003627777, "learning_rate": 2.672631431153247e-06, "loss": 0.0075, "reward": 2.475874900817871, "reward_std": 0.7238146066665649, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44462502002716064, "step": 2973 }, { "completion_length": 126.09375, "epoch": 1.5912252541466025, "grad_norm": 0.5011039972305298, "kl": 0.13838033378124237, "learning_rate": 2.6710786363304463e-06, "loss": 0.0055, "reward": 2.0234375, "reward_std": 0.7409582734107971, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 2974 }, { "completion_length": 120.25, "epoch": 1.5917602996254683, "grad_norm": 1.5324645042419434, "kl": 0.1631508469581604, "learning_rate": 2.6695257751942878e-06, "loss": 0.0065, "reward": 2.2187812328338623, "reward_std": 0.49195435643196106, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4687812328338623, "step": 2975 }, { "completion_length": 150.8125, "epoch": 1.592295345104334, "grad_norm": 2.455862283706665, "kl": 0.13108403980731964, "learning_rate": 2.667972848346691e-06, "loss": 0.0052, "reward": 1.9345624446868896, "reward_std": 0.8276056051254272, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4033125042915344, "step": 2976 }, { "completion_length": 133.90625, "epoch": 1.5928303905831995, "grad_norm": 0.562185525894165, "kl": 0.12786106765270233, "learning_rate": 2.6664198563895983e-06, "loss": 0.0051, "reward": 2.305500030517578, "reward_std": 0.6205042004585266, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43050000071525574, "step": 2977 }, { "completion_length": 132.75, "epoch": 1.5933654360620653, "grad_norm": 1.4530128240585327, "kl": 0.1661522090435028, "learning_rate": 2.6648667999249804e-06, "loss": 0.0066, "reward": 1.9989062547683716, "reward_std": 0.8680202960968018, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4520312547683716, "step": 2978 }, { "completion_length": 161.5, "epoch": 1.593900481540931, "grad_norm": 0.8338686227798462, "kl": 0.13015644252300262, "learning_rate": 2.6633136795548326e-06, "loss": 0.0052, "reward": 1.0873124599456787, "reward_std": 0.6942038536071777, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.3216875195503235, "step": 2979 }, { "completion_length": 119.15625, "epoch": 1.5944355270197965, "grad_norm": 0.6631385684013367, "kl": 0.14907018840312958, "learning_rate": 2.6617604958811714e-06, "loss": 0.006, "reward": 2.1391563415527344, "reward_std": 0.892558753490448, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4829062521457672, "step": 2980 }, { "completion_length": 115.375, "epoch": 1.5949705724986623, "grad_norm": 1.4784328937530518, "kl": 0.1668834388256073, "learning_rate": 2.6602072495060436e-06, "loss": 0.0067, "reward": 2.80859375, "reward_std": 0.7882996797561646, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49609375, "step": 2981 }, { "completion_length": 103.90625, "epoch": 1.595505617977528, "grad_norm": 0.4756526052951813, "kl": 0.1949315369129181, "learning_rate": 2.658653941031516e-06, "loss": 0.0078, "reward": 2.515625, "reward_std": 0.8182295560836792, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 2982 }, { "completion_length": 130.375, "epoch": 1.5960406634563937, "grad_norm": 1.5856231451034546, "kl": 0.18336798250675201, "learning_rate": 2.6571005710596802e-06, "loss": 0.0073, "reward": 2.09765625, "reward_std": 0.8427629470825195, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.48828125, "step": 2983 }, { "completion_length": 149.1875, "epoch": 1.5965757089352595, "grad_norm": 0.765586793422699, "kl": 0.17159852385520935, "learning_rate": 2.655547140192652e-06, "loss": 0.0069, "reward": 2.135406255722046, "reward_std": 0.8668866157531738, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4479062557220459, "step": 2984 }, { "completion_length": 148.21875, "epoch": 1.5971107544141252, "grad_norm": 1.2412117719650269, "kl": 0.15271936357021332, "learning_rate": 2.6539936490325723e-06, "loss": 0.0061, "reward": 1.80859375, "reward_std": 0.37917083501815796, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46484375, "step": 2985 }, { "completion_length": 136.59375, "epoch": 1.597645799892991, "grad_norm": 0.9581247568130493, "kl": 0.23216655850410461, "learning_rate": 2.6524400981816032e-06, "loss": 0.0093, "reward": 1.9296562671661377, "reward_std": 0.9903014898300171, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765312671661377, "step": 2986 }, { "completion_length": 95.875, "epoch": 1.5981808453718567, "grad_norm": 0.8973239064216614, "kl": 0.2718549072742462, "learning_rate": 2.6508864882419304e-06, "loss": 0.0109, "reward": 2.609250068664551, "reward_std": 0.6915702223777771, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 2987 }, { "completion_length": 142.0, "epoch": 1.5987158908507224, "grad_norm": 0.7871801853179932, "kl": 0.16866236925125122, "learning_rate": 2.6493328198157642e-06, "loss": 0.0067, "reward": 1.886625051498413, "reward_std": 1.1877460479736328, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4178749918937683, "step": 2988 }, { "completion_length": 112.0, "epoch": 1.5992509363295881, "grad_norm": 0.8750309348106384, "kl": 0.19488514959812164, "learning_rate": 2.647779093505335e-06, "loss": 0.0078, "reward": 2.350343704223633, "reward_std": 0.6252588033676147, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4753437638282776, "step": 2989 }, { "completion_length": 149.84375, "epoch": 1.5997859818084539, "grad_norm": 2.5257952213287354, "kl": 0.17873318493366241, "learning_rate": 2.6462253099128978e-06, "loss": 0.0071, "reward": 2.0360312461853027, "reward_std": 1.0775258541107178, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41103124618530273, "step": 2990 }, { "completion_length": 131.9375, "epoch": 1.6003210272873194, "grad_norm": 15.889896392822266, "kl": 2.9956982135772705, "learning_rate": 2.6446714696407283e-06, "loss": 0.1198, "reward": 2.5893125534057617, "reward_std": 1.1454403400421143, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47993749380111694, "step": 2991 }, { "completion_length": 125.71875, "epoch": 1.6008560727661851, "grad_norm": 0.5440523624420166, "kl": 0.1555633246898651, "learning_rate": 2.6431175732911234e-06, "loss": 0.0062, "reward": 2.20703125, "reward_std": 0.770760715007782, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 2992 }, { "completion_length": 141.1875, "epoch": 1.6013911182450509, "grad_norm": 1.112788200378418, "kl": 0.3016206622123718, "learning_rate": 2.641563621466406e-06, "loss": 0.0121, "reward": 2.033843755722046, "reward_std": 1.0599241256713867, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4244687557220459, "step": 2993 }, { "completion_length": 119.15625, "epoch": 1.6019261637239164, "grad_norm": 3.4436023235321045, "kl": 0.2764543294906616, "learning_rate": 2.6400096147689162e-06, "loss": 0.0111, "reward": 1.6975312232971191, "reward_std": 0.5089016556739807, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44753125309944153, "step": 2994 }, { "completion_length": 117.03125, "epoch": 1.6024612092027821, "grad_norm": 0.7939090132713318, "kl": 0.1848311871290207, "learning_rate": 2.6384555538010164e-06, "loss": 0.0074, "reward": 2.1864686012268066, "reward_std": 1.017529010772705, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4833437502384186, "step": 2995 }, { "completion_length": 146.375, "epoch": 1.6029962546816479, "grad_norm": 1.0161209106445312, "kl": 0.1519831120967865, "learning_rate": 2.636901439165091e-06, "loss": 0.0061, "reward": 1.4641249179840088, "reward_std": 0.8786452412605286, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41724997758865356, "step": 2996 }, { "completion_length": 127.0, "epoch": 1.6035313001605136, "grad_norm": 0.7520081996917725, "kl": 0.17654669284820557, "learning_rate": 2.6353472714635443e-06, "loss": 0.0071, "reward": 1.984375, "reward_std": 0.5302724242210388, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46875, "step": 2997 }, { "completion_length": 139.96875, "epoch": 1.6040663456393793, "grad_norm": 16.849563598632812, "kl": 1.3815211057662964, "learning_rate": 2.6337930512988014e-06, "loss": 0.0553, "reward": 1.8313124179840088, "reward_std": 0.9486573934555054, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47193747758865356, "step": 2998 }, { "completion_length": 128.34375, "epoch": 1.604601391118245, "grad_norm": 0.8334498405456543, "kl": 0.19381865859031677, "learning_rate": 2.6322387792733085e-06, "loss": 0.0078, "reward": 2.236062526702881, "reward_std": 0.7031677961349487, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47043749690055847, "step": 2999 }, { "completion_length": 149.6875, "epoch": 1.6051364365971108, "grad_norm": 1.1307717561721802, "kl": 0.15979716181755066, "learning_rate": 2.63068445598953e-06, "loss": 0.0064, "reward": 2.155656337738037, "reward_std": 1.1975162029266357, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45253124833106995, "step": 3000 }, { "completion_length": 128.21875, "epoch": 1.6056714820759765, "grad_norm": 15.140463829040527, "kl": 0.812584638595581, "learning_rate": 2.6291300820499525e-06, "loss": 0.0325, "reward": 2.684781312942505, "reward_std": 0.7370704412460327, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4816562533378601, "step": 3001 }, { "completion_length": 131.4375, "epoch": 1.6062065275548423, "grad_norm": 972488896.0, "kl": 1428392.75, "learning_rate": 2.6275756580570803e-06, "loss": 57135.7109, "reward": 2.7766876220703125, "reward_std": 0.7580993175506592, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47981250286102295, "step": 3002 }, { "completion_length": 145.75, "epoch": 1.606741573033708, "grad_norm": 1.0816235542297363, "kl": 0.1756131947040558, "learning_rate": 2.6260211846134387e-06, "loss": 0.007, "reward": 1.6352499723434448, "reward_std": 0.6353256106376648, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4633750021457672, "step": 3003 }, { "completion_length": 146.40625, "epoch": 1.6072766185125735, "grad_norm": 0.9945052862167358, "kl": 0.18711355328559875, "learning_rate": 2.6244666623215714e-06, "loss": 0.0075, "reward": 2.23828125, "reward_std": 0.772544264793396, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47265625, "step": 3004 }, { "completion_length": 130.71875, "epoch": 1.6078116639914393, "grad_norm": 1.3330390453338623, "kl": 0.17533943057060242, "learning_rate": 2.6229120917840406e-06, "loss": 0.007, "reward": 1.6967188119888306, "reward_std": 0.6791276931762695, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4467187523841858, "step": 3005 }, { "completion_length": 142.9375, "epoch": 1.608346709470305, "grad_norm": 0.6713166236877441, "kl": 0.1930074393749237, "learning_rate": 2.621357473603427e-06, "loss": 0.0077, "reward": 2.290062427520752, "reward_std": 0.9400936365127563, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44631248712539673, "step": 3006 }, { "completion_length": 124.5, "epoch": 1.6088817549491705, "grad_norm": 1.1851391792297363, "kl": 0.19295579195022583, "learning_rate": 2.619802808382332e-06, "loss": 0.0077, "reward": 2.7517499923706055, "reward_std": 0.8102045059204102, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48612499237060547, "step": 3007 }, { "completion_length": 133.28125, "epoch": 1.6094168004280363, "grad_norm": 1.8906837701797485, "kl": 0.16636709868907928, "learning_rate": 2.6182480967233727e-06, "loss": 0.0067, "reward": 2.0583748817443848, "reward_std": 0.47956418991088867, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4646250009536743, "step": 3008 }, { "completion_length": 141.75, "epoch": 1.609951845906902, "grad_norm": 56.33926010131836, "kl": 0.4299962520599365, "learning_rate": 2.616693339229184e-06, "loss": 0.0172, "reward": 1.686187505722046, "reward_std": 0.7506657838821411, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4205625057220459, "step": 3009 }, { "completion_length": 135.15625, "epoch": 1.6104868913857677, "grad_norm": 1.8098692893981934, "kl": 0.24929165840148926, "learning_rate": 2.6151385365024216e-06, "loss": 0.01, "reward": 2.0078125, "reward_std": 0.5321882963180542, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 3010 }, { "completion_length": 119.03125, "epoch": 1.6110219368646335, "grad_norm": 1.0776479244232178, "kl": 0.2252005636692047, "learning_rate": 2.613583689145756e-06, "loss": 0.009, "reward": 2.151062488555908, "reward_std": 0.898082971572876, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4635624885559082, "step": 3011 }, { "completion_length": 139.28125, "epoch": 1.6115569823434992, "grad_norm": 1.0363134145736694, "kl": 0.14068175852298737, "learning_rate": 2.6120287977618764e-06, "loss": 0.0056, "reward": 2.0474061965942383, "reward_std": 1.1465210914611816, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42240625619888306, "step": 3012 }, { "completion_length": 150.75, "epoch": 1.612092027822365, "grad_norm": 0.9792031049728394, "kl": 0.14151886105537415, "learning_rate": 2.6104738629534877e-06, "loss": 0.0057, "reward": 2.090250015258789, "reward_std": 0.844306468963623, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44962501525878906, "step": 3013 }, { "completion_length": 120.0, "epoch": 1.6126270733012307, "grad_norm": 0.8921517133712769, "kl": 0.26191115379333496, "learning_rate": 2.6089188853233132e-06, "loss": 0.0105, "reward": 2.6236562728881836, "reward_std": 0.9562890529632568, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4830312430858612, "step": 3014 }, { "completion_length": 144.0625, "epoch": 1.6131621187800964, "grad_norm": 0.6850647926330566, "kl": 0.15254375338554382, "learning_rate": 2.6073638654740906e-06, "loss": 0.0061, "reward": 2.2100000381469727, "reward_std": 0.8707454800605774, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44437500834465027, "step": 3015 }, { "completion_length": 119.625, "epoch": 1.6136971642589621, "grad_norm": 2.5896217823028564, "kl": 0.2954239845275879, "learning_rate": 2.6058088040085765e-06, "loss": 0.0118, "reward": 2.0750937461853027, "reward_std": 0.9403489232063293, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48134374618530273, "step": 3016 }, { "completion_length": 125.5625, "epoch": 1.6142322097378277, "grad_norm": 1.2754822969436646, "kl": 0.20004937052726746, "learning_rate": 2.6042537015295433e-06, "loss": 0.008, "reward": 2.100874900817871, "reward_std": 0.9638927578926086, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46024999022483826, "step": 3017 }, { "completion_length": 153.28125, "epoch": 1.6147672552166934, "grad_norm": 0.9042937159538269, "kl": 0.1811610460281372, "learning_rate": 2.602698558639777e-06, "loss": 0.0072, "reward": 1.8093438148498535, "reward_std": 0.7486664652824402, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46559375524520874, "step": 3018 }, { "completion_length": 134.78125, "epoch": 1.6153023006955591, "grad_norm": 7.266659259796143, "kl": 0.5082975625991821, "learning_rate": 2.6011433759420818e-06, "loss": 0.0203, "reward": 2.1767187118530273, "reward_std": 0.6613441705703735, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4267187714576721, "step": 3019 }, { "completion_length": 128.96875, "epoch": 1.6158373461744249, "grad_norm": 1.1094552278518677, "kl": 0.24742256104946136, "learning_rate": 2.5995881540392753e-06, "loss": 0.0099, "reward": 1.9984688758850098, "reward_std": 0.9146263599395752, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4672187566757202, "step": 3020 }, { "completion_length": 130.375, "epoch": 1.6163723916532904, "grad_norm": 0.6612251400947571, "kl": 0.16352882981300354, "learning_rate": 2.598032893534192e-06, "loss": 0.0065, "reward": 2.03125, "reward_std": 0.8068678975105286, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 3021 }, { "completion_length": 116.34375, "epoch": 1.6169074371321561, "grad_norm": 0.8455619215965271, "kl": 0.18658240139484406, "learning_rate": 2.596477595029681e-06, "loss": 0.0075, "reward": 2.4632811546325684, "reward_std": 0.7893432378768921, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4945312440395355, "step": 3022 }, { "completion_length": 138.78125, "epoch": 1.6174424826110219, "grad_norm": 3.567774534225464, "kl": 0.20630870759487152, "learning_rate": 2.594922259128605e-06, "loss": 0.0083, "reward": 2.2421875, "reward_std": 0.7512622475624084, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 3023 }, { "completion_length": 139.21875, "epoch": 1.6179775280898876, "grad_norm": 1.438726544380188, "kl": 0.1746216118335724, "learning_rate": 2.593366886433843e-06, "loss": 0.007, "reward": 2.198499917984009, "reward_std": 0.5288436412811279, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.40162500739097595, "step": 3024 }, { "completion_length": 114.25, "epoch": 1.6185125735687533, "grad_norm": 3.219280242919922, "kl": 0.30703866481781006, "learning_rate": 2.591811477548286e-06, "loss": 0.0123, "reward": 2.30078125, "reward_std": 0.5203295946121216, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47265625, "step": 3025 }, { "completion_length": 138.125, "epoch": 1.619047619047619, "grad_norm": 1.5918817520141602, "kl": 0.31748324632644653, "learning_rate": 2.590256033074841e-06, "loss": 0.0127, "reward": 2.4468750953674316, "reward_std": 0.9528295993804932, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4468750059604645, "step": 3026 }, { "completion_length": 123.71875, "epoch": 1.6195826645264848, "grad_norm": 2.052074909210205, "kl": 0.46620044112205505, "learning_rate": 2.5887005536164287e-06, "loss": 0.0186, "reward": 1.589687466621399, "reward_std": 0.6792142391204834, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4646874964237213, "step": 3027 }, { "completion_length": 144.75, "epoch": 1.6201177100053505, "grad_norm": 0.8894802927970886, "kl": 0.20817889273166656, "learning_rate": 2.587145039775982e-06, "loss": 0.0083, "reward": 1.7490625381469727, "reward_std": 0.772903561592102, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45218750834465027, "step": 3028 }, { "completion_length": 113.90625, "epoch": 1.6206527554842163, "grad_norm": 0.6108150482177734, "kl": 0.16898313164710999, "learning_rate": 2.5855894921564473e-06, "loss": 0.0068, "reward": 2.78125, "reward_std": 0.7793499231338501, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 3029 }, { "completion_length": 123.375, "epoch": 1.621187800963082, "grad_norm": 1.7176432609558105, "kl": 0.15831685066223145, "learning_rate": 2.5840339113607853e-06, "loss": 0.0063, "reward": 2.15625, "reward_std": 0.3471629023551941, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 3030 }, { "completion_length": 123.53125, "epoch": 1.6217228464419475, "grad_norm": 4.191864967346191, "kl": 0.32004421949386597, "learning_rate": 2.58247829799197e-06, "loss": 0.0128, "reward": 2.370374917984009, "reward_std": 1.112941861152649, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46412500739097595, "step": 3031 }, { "completion_length": 121.875, "epoch": 1.6222578919208133, "grad_norm": 0.9032105803489685, "kl": 0.20759716629981995, "learning_rate": 2.5809226526529846e-06, "loss": 0.0083, "reward": 2.213562488555908, "reward_std": 1.005051851272583, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4791874885559082, "step": 3032 }, { "completion_length": 137.0, "epoch": 1.622792937399679, "grad_norm": 2.576554536819458, "kl": 0.1974070966243744, "learning_rate": 2.579366975946829e-06, "loss": 0.0079, "reward": 1.8485625982284546, "reward_std": 0.7736194729804993, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4579375088214874, "step": 3033 }, { "completion_length": 142.4375, "epoch": 1.6233279828785445, "grad_norm": 2.274554491043091, "kl": 0.15091823041439056, "learning_rate": 2.5778112684765127e-06, "loss": 0.006, "reward": 1.3523750305175781, "reward_std": 0.8620144724845886, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.39925000071525574, "step": 3034 }, { "completion_length": 126.125, "epoch": 1.6238630283574103, "grad_norm": 2.314143419265747, "kl": 0.3154590129852295, "learning_rate": 2.5762555308450573e-06, "loss": 0.0126, "reward": 1.6608749628067017, "reward_std": 0.5909897685050964, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41087499260902405, "step": 3035 }, { "completion_length": 134.78125, "epoch": 1.624398073836276, "grad_norm": 1.133813500404358, "kl": 0.23121440410614014, "learning_rate": 2.574699763655498e-06, "loss": 0.0092, "reward": 1.428781270980835, "reward_std": 0.801803708076477, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44440627098083496, "step": 3036 }, { "completion_length": 110.40625, "epoch": 1.6249331193151417, "grad_norm": 1.4082050323486328, "kl": 0.2146352082490921, "learning_rate": 2.5731439675108772e-06, "loss": 0.0086, "reward": 2.739874839782715, "reward_std": 0.5141955614089966, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48987501859664917, "step": 3037 }, { "completion_length": 144.15625, "epoch": 1.6254681647940075, "grad_norm": 5.604970455169678, "kl": 0.4399428367614746, "learning_rate": 2.5715881430142542e-06, "loss": 0.0176, "reward": 1.6268749237060547, "reward_std": 0.5589946508407593, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.42374998331069946, "step": 3038 }, { "completion_length": 107.90625, "epoch": 1.6260032102728732, "grad_norm": 2.046415090560913, "kl": 0.20836922526359558, "learning_rate": 2.5700322907686947e-06, "loss": 0.0083, "reward": 2.1961874961853027, "reward_std": 0.6068994402885437, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47743749618530273, "step": 3039 }, { "completion_length": 134.4375, "epoch": 1.626538255751739, "grad_norm": 2.1780588626861572, "kl": 0.21752864122390747, "learning_rate": 2.568476411377277e-06, "loss": 0.0087, "reward": 2.5170936584472656, "reward_std": 0.739395260810852, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4702187478542328, "step": 3040 }, { "completion_length": 135.03125, "epoch": 1.6270733012306047, "grad_norm": 3.0969605445861816, "kl": 0.15044143795967102, "learning_rate": 2.5669205054430907e-06, "loss": 0.006, "reward": 2.7090938091278076, "reward_std": 0.9305894374847412, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47471874952316284, "step": 3041 }, { "completion_length": 145.0625, "epoch": 1.6276083467094704, "grad_norm": 0.770773708820343, "kl": 0.14624091982841492, "learning_rate": 2.565364573569234e-06, "loss": 0.0058, "reward": 1.613531231880188, "reward_std": 0.7594902515411377, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4572812616825104, "step": 3042 }, { "completion_length": 112.4375, "epoch": 1.6281433921883361, "grad_norm": 0.945700466632843, "kl": 0.20388971269130707, "learning_rate": 2.5638086163588157e-06, "loss": 0.0082, "reward": 2.4420313835144043, "reward_std": 0.8100603818893433, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47328123450279236, "step": 3043 }, { "completion_length": 109.46875, "epoch": 1.6286784376672017, "grad_norm": 0.858544111251831, "kl": 0.20105230808258057, "learning_rate": 2.5622526344149546e-06, "loss": 0.008, "reward": 2.4769375324249268, "reward_std": 0.8351312279701233, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46131250262260437, "step": 3044 }, { "completion_length": 149.34375, "epoch": 1.6292134831460674, "grad_norm": 1.8366143703460693, "kl": 0.16857443749904633, "learning_rate": 2.5606966283407804e-06, "loss": 0.0067, "reward": 1.858437418937683, "reward_std": 0.9640200138092041, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3740624785423279, "step": 3045 }, { "completion_length": 131.4375, "epoch": 1.6297485286249331, "grad_norm": 1.0531107187271118, "kl": 0.1565093994140625, "learning_rate": 2.559140598739429e-06, "loss": 0.0063, "reward": 1.9561876058578491, "reward_std": 0.8084858655929565, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40931248664855957, "step": 3046 }, { "completion_length": 134.4375, "epoch": 1.6302835741037989, "grad_norm": 0.9308398365974426, "kl": 0.2191639095544815, "learning_rate": 2.557584546214049e-06, "loss": 0.0088, "reward": 1.5716562271118164, "reward_std": 0.7628875970840454, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4154062569141388, "step": 3047 }, { "completion_length": 146.65625, "epoch": 1.6308186195826644, "grad_norm": 1.0237106084823608, "kl": 0.15004783868789673, "learning_rate": 2.556028471367796e-06, "loss": 0.006, "reward": 1.6301875114440918, "reward_std": 0.5927785038948059, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4114375114440918, "step": 3048 }, { "completion_length": 124.5625, "epoch": 1.6313536650615301, "grad_norm": 0.7531304359436035, "kl": 0.16805800795555115, "learning_rate": 2.554472374803832e-06, "loss": 0.0067, "reward": 2.0234375, "reward_std": 0.7205796241760254, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4765625, "step": 3049 }, { "completion_length": 118.71875, "epoch": 1.6318887105403959, "grad_norm": 0.8578194975852966, "kl": 0.17869165539741516, "learning_rate": 2.552916257125333e-06, "loss": 0.0071, "reward": 2.26771879196167, "reward_std": 0.7244406938552856, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47084373235702515, "step": 3050 }, { "completion_length": 149.5, "epoch": 1.6324237560192616, "grad_norm": 1.4716538190841675, "kl": 0.14046907424926758, "learning_rate": 2.5513601189354786e-06, "loss": 0.0056, "reward": 1.5812187194824219, "reward_std": 0.9717457294464111, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44059374928474426, "step": 3051 }, { "completion_length": 121.8125, "epoch": 1.6329588014981273, "grad_norm": 1.1485710144042969, "kl": 0.18701885640621185, "learning_rate": 2.5498039608374573e-06, "loss": 0.0075, "reward": 2.3996875286102295, "reward_std": 0.8348458409309387, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4778124988079071, "step": 3052 }, { "completion_length": 130.34375, "epoch": 1.633493846976993, "grad_norm": 0.37434127926826477, "kl": 0.16258743405342102, "learning_rate": 2.5482477834344676e-06, "loss": 0.0065, "reward": 2.962156295776367, "reward_std": 0.6045994758605957, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4934062361717224, "step": 3053 }, { "completion_length": 138.0625, "epoch": 1.6340288924558588, "grad_norm": 6.494659900665283, "kl": 0.4716867506504059, "learning_rate": 2.5466915873297116e-06, "loss": 0.0189, "reward": 1.414156198501587, "reward_std": 0.6688860654830933, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4454062581062317, "step": 3054 }, { "completion_length": 131.46875, "epoch": 1.6345639379347245, "grad_norm": 59.311180114746094, "kl": 1.7632713317871094, "learning_rate": 2.5451353731264013e-06, "loss": 0.0705, "reward": 1.8319063186645508, "reward_std": 0.7164047956466675, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 3055 }, { "completion_length": 114.78125, "epoch": 1.6350989834135903, "grad_norm": 0.7898251414299011, "kl": 0.17342045903205872, "learning_rate": 2.543579141427756e-06, "loss": 0.0069, "reward": 2.310906171798706, "reward_std": 0.7900955080986023, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48278123140335083, "step": 3056 }, { "completion_length": 127.09375, "epoch": 1.635634028892456, "grad_norm": 0.683124303817749, "kl": 0.1879028081893921, "learning_rate": 2.5420228928370007e-06, "loss": 0.0075, "reward": 2.287968635559082, "reward_std": 1.0723769664764404, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4598437547683716, "step": 3057 }, { "completion_length": 152.28125, "epoch": 1.6361690743713215, "grad_norm": 4.4713921546936035, "kl": 0.19444677233695984, "learning_rate": 2.540466627957366e-06, "loss": 0.0078, "reward": 1.7358437776565552, "reward_std": 0.6524290442466736, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4077187776565552, "step": 3058 }, { "completion_length": 128.28125, "epoch": 1.6367041198501873, "grad_norm": 0.845873236656189, "kl": 0.15785843133926392, "learning_rate": 2.538910347392092e-06, "loss": 0.0063, "reward": 1.7909687757492065, "reward_std": 0.43392544984817505, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46284374594688416, "step": 3059 }, { "completion_length": 169.78125, "epoch": 1.637239165329053, "grad_norm": 1.1939122676849365, "kl": 0.14006993174552917, "learning_rate": 2.53735405174442e-06, "loss": 0.0056, "reward": 1.6132187843322754, "reward_std": 0.8770310878753662, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.410093754529953, "step": 3060 }, { "completion_length": 138.96875, "epoch": 1.6377742108079185, "grad_norm": 0.8058537840843201, "kl": 0.18146297335624695, "learning_rate": 2.535797741617603e-06, "loss": 0.0073, "reward": 1.8769375085830688, "reward_std": 0.6708225011825562, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42381250858306885, "step": 3061 }, { "completion_length": 126.46875, "epoch": 1.6383092562867843, "grad_norm": 1.4552996158599854, "kl": 0.30827513337135315, "learning_rate": 2.534241417614896e-06, "loss": 0.0123, "reward": 1.8450937271118164, "reward_std": 0.9701507091522217, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4388437569141388, "step": 3062 }, { "completion_length": 141.0, "epoch": 1.63884430176565, "grad_norm": 0.8456292152404785, "kl": 0.2146884799003601, "learning_rate": 2.5326850803395577e-06, "loss": 0.0086, "reward": 1.41796875, "reward_std": 0.6835616827011108, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46484375, "step": 3063 }, { "completion_length": 139.75, "epoch": 1.6393793472445157, "grad_norm": 0.8995453715324402, "kl": 0.15300646424293518, "learning_rate": 2.531128730394857e-06, "loss": 0.0061, "reward": 1.87890625, "reward_std": 1.1539627313613892, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44140625, "step": 3064 }, { "completion_length": 130.0625, "epoch": 1.6399143927233815, "grad_norm": 0.9801468849182129, "kl": 0.17033343017101288, "learning_rate": 2.529572368384064e-06, "loss": 0.0068, "reward": 2.639343738555908, "reward_std": 0.6227976083755493, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4518437385559082, "step": 3065 }, { "completion_length": 105.1875, "epoch": 1.6404494382022472, "grad_norm": 0.4977405369281769, "kl": 0.17320287227630615, "learning_rate": 2.5280159949104537e-06, "loss": 0.0069, "reward": 2.461625099182129, "reward_std": 0.27641987800598145, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49287500977516174, "step": 3066 }, { "completion_length": 116.8125, "epoch": 1.640984483681113, "grad_norm": 12.596583366394043, "kl": 0.7275682091712952, "learning_rate": 2.5264596105773077e-06, "loss": 0.0291, "reward": 2.2802813053131104, "reward_std": 0.2978418469429016, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4677812457084656, "step": 3067 }, { "completion_length": 138.3125, "epoch": 1.6415195291599787, "grad_norm": 2.501781463623047, "kl": 0.18547451496124268, "learning_rate": 2.524903215987909e-06, "loss": 0.0074, "reward": 1.4608436822891235, "reward_std": 0.8582541942596436, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4452187418937683, "step": 3068 }, { "completion_length": 134.65625, "epoch": 1.6420545746388444, "grad_norm": 6.135666370391846, "kl": 0.44992268085479736, "learning_rate": 2.5233468117455462e-06, "loss": 0.018, "reward": 2.757625102996826, "reward_std": 0.8710789680480957, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.445125013589859, "step": 3069 }, { "completion_length": 120.875, "epoch": 1.6425896201177101, "grad_norm": 1.5369125604629517, "kl": 0.19300930202007294, "learning_rate": 2.5217903984535123e-06, "loss": 0.0077, "reward": 1.84375, "reward_std": 0.8032137155532837, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 3070 }, { "completion_length": 154.03125, "epoch": 1.6431246655965757, "grad_norm": 1.0025553703308105, "kl": 0.1854628026485443, "learning_rate": 2.520233976715103e-06, "loss": 0.0074, "reward": 1.5436562299728394, "reward_std": 0.5726227760314941, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.41865622997283936, "step": 3071 }, { "completion_length": 176.0625, "epoch": 1.6436597110754414, "grad_norm": 1.3382976055145264, "kl": 0.17167414724826813, "learning_rate": 2.5186775471336157e-06, "loss": 0.0069, "reward": 0.9446874856948853, "reward_std": 0.5295165777206421, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.33531251549720764, "step": 3072 }, { "completion_length": 154.84375, "epoch": 1.6441947565543071, "grad_norm": 0.8189997673034668, "kl": 0.17800627648830414, "learning_rate": 2.5171211103123533e-06, "loss": 0.0071, "reward": 1.698468804359436, "reward_std": 0.9520377516746521, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41721874475479126, "step": 3073 }, { "completion_length": 150.1875, "epoch": 1.6447298020331729, "grad_norm": 176.36386108398438, "kl": 1.280846118927002, "learning_rate": 2.5155646668546207e-06, "loss": 0.0512, "reward": 1.6008750200271606, "reward_std": 0.7549490332603455, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42900002002716064, "step": 3074 }, { "completion_length": 133.75, "epoch": 1.6452648475120384, "grad_norm": 1.4399843215942383, "kl": 0.17276901006698608, "learning_rate": 2.514008217363725e-06, "loss": 0.0069, "reward": 2.731874942779541, "reward_std": 0.9112969636917114, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4662500023841858, "step": 3075 }, { "completion_length": 124.0, "epoch": 1.6457998929909041, "grad_norm": 1.8868818283081055, "kl": 0.18546167016029358, "learning_rate": 2.512451762442976e-06, "loss": 0.0074, "reward": 2.2845001220703125, "reward_std": 0.7232925891876221, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48762500286102295, "step": 3076 }, { "completion_length": 144.53125, "epoch": 1.6463349384697699, "grad_norm": 8.452707290649414, "kl": 0.541367769241333, "learning_rate": 2.5108953026956843e-06, "loss": 0.0217, "reward": 1.522281289100647, "reward_std": 0.47929561138153076, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4754062592983246, "step": 3077 }, { "completion_length": 129.59375, "epoch": 1.6468699839486356, "grad_norm": 5.146972179412842, "kl": 0.21020865440368652, "learning_rate": 2.5093388387251656e-06, "loss": 0.0084, "reward": 1.8063125610351562, "reward_std": 0.7538280487060547, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4625625014305115, "step": 3078 }, { "completion_length": 109.59375, "epoch": 1.6474050294275013, "grad_norm": 0.9019163846969604, "kl": 0.1794697642326355, "learning_rate": 2.5077823711347338e-06, "loss": 0.0072, "reward": 2.671875, "reward_std": 0.5344237685203552, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 3079 }, { "completion_length": 145.9375, "epoch": 1.647940074906367, "grad_norm": 1.3523530960083008, "kl": 0.13887405395507812, "learning_rate": 2.506225900527705e-06, "loss": 0.0056, "reward": 1.6796875, "reward_std": 0.9666056632995605, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4609375, "step": 3080 }, { "completion_length": 124.71875, "epoch": 1.6484751203852328, "grad_norm": 1.4583536386489868, "kl": 0.1762717068195343, "learning_rate": 2.504669427507399e-06, "loss": 0.0071, "reward": 2.2305002212524414, "reward_std": 1.0758438110351562, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4648750126361847, "step": 3081 }, { "completion_length": 135.0, "epoch": 1.6490101658640985, "grad_norm": 2.5005571842193604, "kl": 0.16193699836730957, "learning_rate": 2.503112952677133e-06, "loss": 0.0065, "reward": 1.7866874933242798, "reward_std": 0.6049177050590515, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4429374933242798, "step": 3082 }, { "completion_length": 115.875, "epoch": 1.6495452113429643, "grad_norm": 0.8749712705612183, "kl": 0.2931307554244995, "learning_rate": 2.501556476640226e-06, "loss": 0.0117, "reward": 3.015625, "reward_std": 0.7202834486961365, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 3083 }, { "completion_length": 144.5, "epoch": 1.65008025682183, "grad_norm": 0.9243875741958618, "kl": 0.13481543958187103, "learning_rate": 2.5e-06, "loss": 0.0054, "reward": 1.8167188167572021, "reward_std": 0.7919143438339233, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.457343727350235, "step": 3084 }, { "completion_length": 133.84375, "epoch": 1.6506153023006955, "grad_norm": 0.5230175852775574, "kl": 0.19566918909549713, "learning_rate": 2.4984435233597735e-06, "loss": 0.0078, "reward": 2.505593776702881, "reward_std": 0.5158241391181946, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48996874690055847, "step": 3085 }, { "completion_length": 138.6875, "epoch": 1.6511503477795613, "grad_norm": 0.8052110075950623, "kl": 0.16942192614078522, "learning_rate": 2.496887047322868e-06, "loss": 0.0068, "reward": 2.101468801498413, "reward_std": 0.9823170900344849, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4452187418937683, "step": 3086 }, { "completion_length": 133.5625, "epoch": 1.651685393258427, "grad_norm": 1.013598084449768, "kl": 0.17331300675868988, "learning_rate": 2.4953305724926018e-06, "loss": 0.0069, "reward": 2.1946873664855957, "reward_std": 0.8780487179756165, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49156248569488525, "step": 3087 }, { "completion_length": 138.4375, "epoch": 1.6522204387372925, "grad_norm": 1.2356131076812744, "kl": 0.21678608655929565, "learning_rate": 2.4937740994722953e-06, "loss": 0.0087, "reward": 1.2070624828338623, "reward_std": 0.3829883933067322, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4414375126361847, "step": 3088 }, { "completion_length": 120.5625, "epoch": 1.6527554842161583, "grad_norm": 0.8274263739585876, "kl": 0.19376805424690247, "learning_rate": 2.492217628865267e-06, "loss": 0.0078, "reward": 2.515625, "reward_std": 0.7428563833236694, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 3089 }, { "completion_length": 135.59375, "epoch": 1.653290529695024, "grad_norm": 1.9439501762390137, "kl": 0.15314948558807373, "learning_rate": 2.490661161274835e-06, "loss": 0.0061, "reward": 1.8299686908721924, "reward_std": 1.028785228729248, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45496875047683716, "step": 3090 }, { "completion_length": 150.5, "epoch": 1.6538255751738897, "grad_norm": 0.9039730429649353, "kl": 0.1223328486084938, "learning_rate": 2.489104697304316e-06, "loss": 0.0049, "reward": 1.70131254196167, "reward_std": 0.3363787829875946, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43568748235702515, "step": 3091 }, { "completion_length": 124.875, "epoch": 1.6543606206527555, "grad_norm": 1.0890908241271973, "kl": 0.18841376900672913, "learning_rate": 2.487548237557025e-06, "loss": 0.0075, "reward": 2.17759370803833, "reward_std": 1.125465989112854, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44321876764297485, "step": 3092 }, { "completion_length": 115.78125, "epoch": 1.6548956661316212, "grad_norm": 1.3647624254226685, "kl": 0.1915498971939087, "learning_rate": 2.4859917826362764e-06, "loss": 0.0077, "reward": 2.2536563873291016, "reward_std": 0.9095392227172852, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.488031268119812, "step": 3093 }, { "completion_length": 128.71875, "epoch": 1.655430711610487, "grad_norm": 1.4291894435882568, "kl": 0.1748054325580597, "learning_rate": 2.4844353331453797e-06, "loss": 0.007, "reward": 1.851062536239624, "reward_std": 0.8167535662651062, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47606250643730164, "step": 3094 }, { "completion_length": 155.46875, "epoch": 1.6559657570893527, "grad_norm": 1.152038812637329, "kl": 0.1708592176437378, "learning_rate": 2.4828788896876467e-06, "loss": 0.0068, "reward": 1.3758437633514404, "reward_std": 0.6599516868591309, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.39146873354911804, "step": 3095 }, { "completion_length": 151.84375, "epoch": 1.6565008025682184, "grad_norm": 0.8006393909454346, "kl": 0.17684049904346466, "learning_rate": 2.4813224528663855e-06, "loss": 0.0071, "reward": 1.8953125476837158, "reward_std": 0.9695590734481812, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42656248807907104, "step": 3096 }, { "completion_length": 124.53125, "epoch": 1.6570358480470841, "grad_norm": 1.197859525680542, "kl": 0.1964918076992035, "learning_rate": 2.479766023284898e-06, "loss": 0.0079, "reward": 1.6067187786102295, "reward_std": 0.8004473447799683, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4660937190055847, "step": 3097 }, { "completion_length": 130.34375, "epoch": 1.6575708935259497, "grad_norm": 1.1299654245376587, "kl": 0.19239360094070435, "learning_rate": 2.4782096015464885e-06, "loss": 0.0077, "reward": 1.3163750171661377, "reward_std": 0.5371227860450745, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4413749873638153, "step": 3098 }, { "completion_length": 127.625, "epoch": 1.6581059390048154, "grad_norm": 1.0142605304718018, "kl": 0.2863227128982544, "learning_rate": 2.4766531882544546e-06, "loss": 0.0115, "reward": 1.59375, "reward_std": 0.5519701242446899, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.453125, "step": 3099 }, { "completion_length": 133.6875, "epoch": 1.6586409844836811, "grad_norm": 0.7826068997383118, "kl": 0.14041632413864136, "learning_rate": 2.475096784012092e-06, "loss": 0.0056, "reward": 1.6734373569488525, "reward_std": 0.8632544279098511, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4234374761581421, "step": 3100 }, { "completion_length": 139.25, "epoch": 1.6591760299625467, "grad_norm": 1.9713321924209595, "kl": 0.17023965716362, "learning_rate": 2.4735403894226936e-06, "loss": 0.0068, "reward": 2.027031183242798, "reward_std": 0.6815953254699707, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4176562428474426, "step": 3101 }, { "completion_length": 162.96875, "epoch": 1.6597110754414124, "grad_norm": 1.8295584917068481, "kl": 0.2691984474658966, "learning_rate": 2.4719840050895467e-06, "loss": 0.0108, "reward": 1.8084375858306885, "reward_std": 1.049372911453247, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3865624964237213, "step": 3102 }, { "completion_length": 139.1875, "epoch": 1.6602461209202781, "grad_norm": 0.5970475673675537, "kl": 0.16264599561691284, "learning_rate": 2.4704276316159366e-06, "loss": 0.0065, "reward": 1.5351874828338623, "reward_std": 0.41504159569740295, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4101874828338623, "step": 3103 }, { "completion_length": 135.53125, "epoch": 1.6607811663991439, "grad_norm": 0.7767385840415955, "kl": 0.1678316295146942, "learning_rate": 2.4688712696051437e-06, "loss": 0.0067, "reward": 2.2421875, "reward_std": 0.7811520099639893, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4921875, "step": 3104 }, { "completion_length": 128.5625, "epoch": 1.6613162118780096, "grad_norm": 0.9684968590736389, "kl": 0.14681357145309448, "learning_rate": 2.4673149196604427e-06, "loss": 0.0059, "reward": 1.86328125, "reward_std": 0.7351405620574951, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 3105 }, { "completion_length": 130.5625, "epoch": 1.6618512573568753, "grad_norm": 0.9986547827720642, "kl": 0.16016599535942078, "learning_rate": 2.4657585823851053e-06, "loss": 0.0064, "reward": 2.10365629196167, "reward_std": 0.8799132704734802, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44740623235702515, "step": 3106 }, { "completion_length": 118.71875, "epoch": 1.662386302835741, "grad_norm": 0.6609882116317749, "kl": 0.17879942059516907, "learning_rate": 2.4642022583823972e-06, "loss": 0.0072, "reward": 2.0384063720703125, "reward_std": 0.8533704280853271, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49153125286102295, "step": 3107 }, { "completion_length": 127.9375, "epoch": 1.6629213483146068, "grad_norm": 0.40359196066856384, "kl": 0.14699991047382355, "learning_rate": 2.4626459482555797e-06, "loss": 0.0059, "reward": 2.29296875, "reward_std": 0.3470717668533325, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46484375, "step": 3108 }, { "completion_length": 146.1875, "epoch": 1.6634563937934725, "grad_norm": 1.9311796426773071, "kl": 0.1797364056110382, "learning_rate": 2.4610896526079094e-06, "loss": 0.0072, "reward": 1.9695937633514404, "reward_std": 0.7189245223999023, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45396876335144043, "step": 3109 }, { "completion_length": 119.15625, "epoch": 1.6639914392723383, "grad_norm": 0.5718482732772827, "kl": 0.18878301978111267, "learning_rate": 2.4595333720426344e-06, "loss": 0.0076, "reward": 2.4361250400543213, "reward_std": 0.9495420455932617, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4673749804496765, "step": 3110 }, { "completion_length": 135.3125, "epoch": 1.664526484751204, "grad_norm": 0.47203031182289124, "kl": 0.14992254972457886, "learning_rate": 2.4579771071630006e-06, "loss": 0.006, "reward": 2.2852187156677246, "reward_std": 0.7419458031654358, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.457093745470047, "step": 3111 }, { "completion_length": 130.15625, "epoch": 1.6650615302300695, "grad_norm": 1.429177165031433, "kl": 0.16928517818450928, "learning_rate": 2.4564208585722444e-06, "loss": 0.0068, "reward": 1.9089374542236328, "reward_std": 0.37156569957733154, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4714375138282776, "step": 3112 }, { "completion_length": 114.875, "epoch": 1.6655965757089353, "grad_norm": 15.316442489624023, "kl": 1.1924676895141602, "learning_rate": 2.4548646268735987e-06, "loss": 0.0477, "reward": 2.018749952316284, "reward_std": 0.5563600063323975, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48750001192092896, "step": 3113 }, { "completion_length": 153.40625, "epoch": 1.666131621187801, "grad_norm": 9.409712791442871, "kl": 1.1301255226135254, "learning_rate": 2.4533084126702893e-06, "loss": 0.0452, "reward": 1.494937539100647, "reward_std": 0.6184190511703491, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4011875092983246, "step": 3114 }, { "completion_length": 125.46875, "epoch": 1.6666666666666665, "grad_norm": 1.0158181190490723, "kl": 0.16419219970703125, "learning_rate": 2.4517522165655333e-06, "loss": 0.0066, "reward": 2.739062547683716, "reward_std": 0.5662603378295898, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48906248807907104, "step": 3115 }, { "completion_length": 121.15625, "epoch": 1.6672017121455323, "grad_norm": 3.1509506702423096, "kl": 0.4616633951663971, "learning_rate": 2.450196039162544e-06, "loss": 0.0185, "reward": 1.7265625, "reward_std": 0.7788664698600769, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 3116 }, { "completion_length": 127.34375, "epoch": 1.667736757624398, "grad_norm": 1.1231811046600342, "kl": 0.15880393981933594, "learning_rate": 2.448639881064522e-06, "loss": 0.0064, "reward": 2.6716561317443848, "reward_std": 0.7434578537940979, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4841562509536743, "step": 3117 }, { "completion_length": 138.625, "epoch": 1.6682718031032637, "grad_norm": 0.9287656545639038, "kl": 0.18957854807376862, "learning_rate": 2.4470837428746673e-06, "loss": 0.0076, "reward": 1.9040000438690186, "reward_std": 0.7677388191223145, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4196249842643738, "step": 3118 }, { "completion_length": 123.1875, "epoch": 1.6688068485821295, "grad_norm": 3.49595308303833, "kl": 0.48520949482917786, "learning_rate": 2.445527625196169e-06, "loss": 0.0194, "reward": 2.290187358856201, "reward_std": 0.8890236616134644, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4933124780654907, "step": 3119 }, { "completion_length": 147.21875, "epoch": 1.6693418940609952, "grad_norm": 1.4286279678344727, "kl": 0.1600649654865265, "learning_rate": 2.443971528632205e-06, "loss": 0.0064, "reward": 1.6424999237060547, "reward_std": 0.7627279162406921, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42374998331069946, "step": 3120 }, { "completion_length": 134.03125, "epoch": 1.669876939539861, "grad_norm": 1.3642139434814453, "kl": 0.17627114057540894, "learning_rate": 2.442415453785952e-06, "loss": 0.0071, "reward": 2.13100004196167, "reward_std": 0.7367475032806396, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47474998235702515, "step": 3121 }, { "completion_length": 143.875, "epoch": 1.6704119850187267, "grad_norm": 0.5681651830673218, "kl": 0.16788606345653534, "learning_rate": 2.4408594012605717e-06, "loss": 0.0067, "reward": 2.2652499675750732, "reward_std": 0.908000111579895, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46837499737739563, "step": 3122 }, { "completion_length": 123.1875, "epoch": 1.6709470304975924, "grad_norm": 0.6893980503082275, "kl": 0.17295421659946442, "learning_rate": 2.43930337165922e-06, "loss": 0.0069, "reward": 2.5625, "reward_std": 0.4918614327907562, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 3123 }, { "completion_length": 164.71875, "epoch": 1.6714820759764581, "grad_norm": 0.8803556561470032, "kl": 0.14482924342155457, "learning_rate": 2.4377473655850462e-06, "loss": 0.0058, "reward": 1.7005937099456787, "reward_std": 0.7490816116333008, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3724687695503235, "step": 3124 }, { "completion_length": 117.8125, "epoch": 1.6720171214553237, "grad_norm": 0.811690092086792, "kl": 0.2383955717086792, "learning_rate": 2.436191383641185e-06, "loss": 0.0095, "reward": 2.5638749599456787, "reward_std": 1.035881519317627, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4701250195503235, "step": 3125 }, { "completion_length": 152.71875, "epoch": 1.6725521669341894, "grad_norm": 2.8319616317749023, "kl": 0.2390364408493042, "learning_rate": 2.4346354264307674e-06, "loss": 0.0096, "reward": 1.4370625019073486, "reward_std": 0.6907188296318054, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.32768750190734863, "step": 3126 }, { "completion_length": 132.0625, "epoch": 1.6730872124130551, "grad_norm": 1.1526427268981934, "kl": 0.17280298471450806, "learning_rate": 2.43307949455691e-06, "loss": 0.0069, "reward": 1.953125, "reward_std": 0.4534076154232025, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 3127 }, { "completion_length": 138.84375, "epoch": 1.6736222578919207, "grad_norm": 4.820899486541748, "kl": 0.18445232510566711, "learning_rate": 2.4315235886227233e-06, "loss": 0.0074, "reward": 1.6343125104904175, "reward_std": 0.5018917322158813, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4780625104904175, "step": 3128 }, { "completion_length": 119.15625, "epoch": 1.6741573033707864, "grad_norm": 0.7034662365913391, "kl": 0.18479031324386597, "learning_rate": 2.429967709231306e-06, "loss": 0.0074, "reward": 2.151937484741211, "reward_std": 0.7450258135795593, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48006248474121094, "step": 3129 }, { "completion_length": 133.0, "epoch": 1.6746923488496521, "grad_norm": 1.8574870824813843, "kl": 0.2597821056842804, "learning_rate": 2.4284118569857466e-06, "loss": 0.0104, "reward": 2.5931248664855957, "reward_std": 0.6118266582489014, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48374998569488525, "step": 3130 }, { "completion_length": 143.03125, "epoch": 1.6752273943285179, "grad_norm": 0.6762158870697021, "kl": 0.1497369259595871, "learning_rate": 2.4268560324891236e-06, "loss": 0.006, "reward": 2.176187515258789, "reward_std": 0.6238462328910828, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45743751525878906, "step": 3131 }, { "completion_length": 105.25, "epoch": 1.6757624398073836, "grad_norm": 32.7228889465332, "kl": 0.25778645277023315, "learning_rate": 2.4253002363445034e-06, "loss": 0.0103, "reward": 2.5, "reward_std": 0.7861415147781372, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 3132 }, { "completion_length": 136.15625, "epoch": 1.6762974852862493, "grad_norm": 0.8895025253295898, "kl": 0.20957006514072418, "learning_rate": 2.4237444691549427e-06, "loss": 0.0084, "reward": 2.1332499980926514, "reward_std": 1.0631967782974243, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47699999809265137, "step": 3133 }, { "completion_length": 118.21875, "epoch": 1.676832530765115, "grad_norm": 23.40359878540039, "kl": 1.2906748056411743, "learning_rate": 2.422188731523488e-06, "loss": 0.0516, "reward": 2.21875, "reward_std": 0.4704121947288513, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 3134 }, { "completion_length": 128.90625, "epoch": 1.6773675762439808, "grad_norm": 1.3291186094284058, "kl": 0.18724629282951355, "learning_rate": 2.4206330240531713e-06, "loss": 0.0075, "reward": 2.3022499084472656, "reward_std": 0.9196746349334717, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4584999978542328, "step": 3135 }, { "completion_length": 111.28125, "epoch": 1.6779026217228465, "grad_norm": 1.6532827615737915, "kl": 0.20336811244487762, "learning_rate": 2.4190773473470166e-06, "loss": 0.0081, "reward": 1.6877813339233398, "reward_std": 0.7687060832977295, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4690312445163727, "step": 3136 }, { "completion_length": 150.53125, "epoch": 1.6784376672017123, "grad_norm": 0.514609158039093, "kl": 0.14583277702331543, "learning_rate": 2.417521702008031e-06, "loss": 0.0058, "reward": 1.9193437099456787, "reward_std": 1.0341728925704956, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4505937397480011, "step": 3137 }, { "completion_length": 117.25, "epoch": 1.6789727126805778, "grad_norm": 337669.5, "kl": 2898.6064453125, "learning_rate": 2.4159660886392147e-06, "loss": 115.9443, "reward": 2.4828124046325684, "reward_std": 0.43765559792518616, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4828125238418579, "step": 3138 }, { "completion_length": 127.125, "epoch": 1.6795077581594435, "grad_norm": 1.101532220840454, "kl": 0.17423218488693237, "learning_rate": 2.414410507843554e-06, "loss": 0.007, "reward": 1.83984375, "reward_std": 0.5093922019004822, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49609375, "step": 3139 }, { "completion_length": 121.9375, "epoch": 1.6800428036383093, "grad_norm": 0.8400896191596985, "kl": 0.19125762581825256, "learning_rate": 2.4128549602240185e-06, "loss": 0.0077, "reward": 2.418156147003174, "reward_std": 1.0685129165649414, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4806562662124634, "step": 3140 }, { "completion_length": 114.09375, "epoch": 1.680577849117175, "grad_norm": 1.4671980142593384, "kl": 0.17802348732948303, "learning_rate": 2.4112994463835717e-06, "loss": 0.0071, "reward": 2.1728436946868896, "reward_std": 0.7289335131645203, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4853437542915344, "step": 3141 }, { "completion_length": 109.34375, "epoch": 1.6811128945960405, "grad_norm": 20100.2421875, "kl": 89.0874252319336, "learning_rate": 2.4097439669251595e-06, "loss": 3.5635, "reward": 2.5732500553131104, "reward_std": 1.0542243719100952, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4951249957084656, "step": 3142 }, { "completion_length": 119.96875, "epoch": 1.6816479400749063, "grad_norm": 0.8202652931213379, "kl": 0.16857844591140747, "learning_rate": 2.408188522451714e-06, "loss": 0.0067, "reward": 2.1051249504089355, "reward_std": 0.9050133228302002, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4488750100135803, "step": 3143 }, { "completion_length": 130.78125, "epoch": 1.682182985553772, "grad_norm": 0.7854466438293457, "kl": 0.1878761351108551, "learning_rate": 2.4066331135661582e-06, "loss": 0.0075, "reward": 1.3899062871932983, "reward_std": 0.45890453457832336, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46803125739097595, "step": 3144 }, { "completion_length": 117.09375, "epoch": 1.6827180310326377, "grad_norm": 1.5921783447265625, "kl": 0.1919291913509369, "learning_rate": 2.405077740871396e-06, "loss": 0.0077, "reward": 1.8494999408721924, "reward_std": 0.7833782434463501, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47450000047683716, "step": 3145 }, { "completion_length": 136.625, "epoch": 1.6832530765115035, "grad_norm": 1.0505073070526123, "kl": 0.1730906367301941, "learning_rate": 2.403522404970319e-06, "loss": 0.0069, "reward": 2.2623438835144043, "reward_std": 0.9926912188529968, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43421876430511475, "step": 3146 }, { "completion_length": 141.4375, "epoch": 1.6837881219903692, "grad_norm": 3.036771059036255, "kl": 0.5293207168579102, "learning_rate": 2.4019671064658088e-06, "loss": 0.0212, "reward": 2.159843683242798, "reward_std": 1.0133321285247803, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4567187428474426, "step": 3147 }, { "completion_length": 111.53125, "epoch": 1.684323167469235, "grad_norm": 0.8587340712547302, "kl": 0.18759144842624664, "learning_rate": 2.4004118459607255e-06, "loss": 0.0075, "reward": 3.21875, "reward_std": 0.5381704568862915, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 3148 }, { "completion_length": 116.5625, "epoch": 1.6848582129481007, "grad_norm": 0.47986850142478943, "kl": 0.20435559749603271, "learning_rate": 2.3988566240579195e-06, "loss": 0.0082, "reward": 2.08203125, "reward_std": 0.3529890179634094, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 3149 }, { "completion_length": 138.46875, "epoch": 1.6853932584269664, "grad_norm": 221.429443359375, "kl": 1.6696515083312988, "learning_rate": 2.397301441360224e-06, "loss": 0.0668, "reward": 2.279562473297119, "reward_std": 0.7262374758720398, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45143750309944153, "step": 3150 }, { "completion_length": 145.25, "epoch": 1.6859283039058321, "grad_norm": 0.8724892735481262, "kl": 0.15660065412521362, "learning_rate": 2.3957462984704576e-06, "loss": 0.0063, "reward": 2.3425936698913574, "reward_std": 1.1597763299942017, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4363437592983246, "step": 3151 }, { "completion_length": 129.21875, "epoch": 1.6864633493846977, "grad_norm": 0.8522737622261047, "kl": 0.21531125903129578, "learning_rate": 2.394191195991424e-06, "loss": 0.0086, "reward": 2.37890625, "reward_std": 0.6510730981826782, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 3152 }, { "completion_length": 139.40625, "epoch": 1.6869983948635634, "grad_norm": 0.8439301252365112, "kl": 0.18935734033584595, "learning_rate": 2.39263613452591e-06, "loss": 0.0076, "reward": 2.666874885559082, "reward_std": 1.1275674104690552, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4637500047683716, "step": 3153 }, { "completion_length": 114.75, "epoch": 1.6875334403424291, "grad_norm": 0.9749462604522705, "kl": 0.20384399592876434, "learning_rate": 2.3910811146766884e-06, "loss": 0.0082, "reward": 2.4787187576293945, "reward_std": 0.9325612783432007, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47871875762939453, "step": 3154 }, { "completion_length": 138.65625, "epoch": 1.6880684858212947, "grad_norm": 0.6495341062545776, "kl": 0.1557432860136032, "learning_rate": 2.389526137046513e-06, "loss": 0.0062, "reward": 2.094062566757202, "reward_std": 0.8035300970077515, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4690625071525574, "step": 3155 }, { "completion_length": 141.40625, "epoch": 1.6886035313001604, "grad_norm": 0.7029533982276917, "kl": 0.13363146781921387, "learning_rate": 2.387971202238124e-06, "loss": 0.0053, "reward": 1.7990624904632568, "reward_std": 1.0457425117492676, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.43968749046325684, "step": 3156 }, { "completion_length": 136.1875, "epoch": 1.6891385767790261, "grad_norm": 0.599409818649292, "kl": 0.19657546281814575, "learning_rate": 2.3864163108542444e-06, "loss": 0.0079, "reward": 1.919624924659729, "reward_std": 0.8915787935256958, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45087501406669617, "step": 3157 }, { "completion_length": 119.53125, "epoch": 1.6896736222578919, "grad_norm": 1.209818959236145, "kl": 0.16703394055366516, "learning_rate": 2.384861463497579e-06, "loss": 0.0067, "reward": 3.210625171661377, "reward_std": 0.5561158061027527, "rewards/correctness_reward_func": 1.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4762499928474426, "step": 3158 }, { "completion_length": 115.625, "epoch": 1.6902086677367576, "grad_norm": 11.70821475982666, "kl": 1.2697656154632568, "learning_rate": 2.383306660770817e-06, "loss": 0.0508, "reward": 2.7145938873291016, "reward_std": 0.7787563800811768, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.480218768119812, "step": 3159 }, { "completion_length": 113.125, "epoch": 1.6907437132156233, "grad_norm": 1.2619261741638184, "kl": 0.2295560985803604, "learning_rate": 2.3817519032766286e-06, "loss": 0.0092, "reward": 2.1892499923706055, "reward_std": 0.7174574136734009, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47049999237060547, "step": 3160 }, { "completion_length": 138.75, "epoch": 1.691278758694489, "grad_norm": 0.7898891568183899, "kl": 0.20091909170150757, "learning_rate": 2.3801971916176685e-06, "loss": 0.008, "reward": 2.4009687900543213, "reward_std": 0.8524676561355591, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4634687304496765, "step": 3161 }, { "completion_length": 115.375, "epoch": 1.6918138041733548, "grad_norm": 3.2765567302703857, "kl": 0.6335122585296631, "learning_rate": 2.378642526396574e-06, "loss": 0.0253, "reward": 2.1831562519073486, "reward_std": 0.7706515192985535, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40190625190734863, "step": 3162 }, { "completion_length": 138.25, "epoch": 1.6923488496522205, "grad_norm": 3.118790626525879, "kl": 0.16302277147769928, "learning_rate": 2.37708790821596e-06, "loss": 0.0065, "reward": 1.5583751201629639, "reward_std": 0.4113177955150604, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3708750009536743, "step": 3163 }, { "completion_length": 139.65625, "epoch": 1.6928838951310863, "grad_norm": 3.7781810760498047, "kl": 0.633012056350708, "learning_rate": 2.37553333767843e-06, "loss": 0.0253, "reward": 1.8158750534057617, "reward_std": 1.031606674194336, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.42524999380111694, "step": 3164 }, { "completion_length": 135.5625, "epoch": 1.6934189406099518, "grad_norm": 1.3501086235046387, "kl": 0.1928791105747223, "learning_rate": 2.373978815386562e-06, "loss": 0.0077, "reward": 2.274937629699707, "reward_std": 0.6778589487075806, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4311874806880951, "step": 3165 }, { "completion_length": 133.9375, "epoch": 1.6939539860888175, "grad_norm": 1.00400710105896, "kl": 0.15710897743701935, "learning_rate": 2.3724243419429197e-06, "loss": 0.0063, "reward": 1.917312502861023, "reward_std": 1.0171279907226562, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44856250286102295, "step": 3166 }, { "completion_length": 125.03125, "epoch": 1.6944890315676833, "grad_norm": 0.771248459815979, "kl": 0.19324347376823425, "learning_rate": 2.3708699179500487e-06, "loss": 0.0077, "reward": 1.781156301498413, "reward_std": 0.6131486892700195, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4217812418937683, "step": 3167 }, { "completion_length": 115.65625, "epoch": 1.695024077046549, "grad_norm": 2.377681016921997, "kl": 0.34609007835388184, "learning_rate": 2.369315544010471e-06, "loss": 0.0138, "reward": 2.4513750076293945, "reward_std": 0.8661600351333618, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48262500762939453, "step": 3168 }, { "completion_length": 159.375, "epoch": 1.6955591225254145, "grad_norm": 0.9749809503555298, "kl": 0.15495318174362183, "learning_rate": 2.3677612207266932e-06, "loss": 0.0062, "reward": 1.3558437824249268, "reward_std": 0.8422086238861084, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.355843722820282, "step": 3169 }, { "completion_length": 132.625, "epoch": 1.6960941680042803, "grad_norm": 0.9090114235877991, "kl": 0.15859797596931458, "learning_rate": 2.3662069487011994e-06, "loss": 0.0063, "reward": 2.552687644958496, "reward_std": 0.8602401614189148, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47456252574920654, "step": 3170 }, { "completion_length": 153.1875, "epoch": 1.696629213483146, "grad_norm": 0.575765073299408, "kl": 0.12599946558475494, "learning_rate": 2.3646527285364565e-06, "loss": 0.005, "reward": 1.8220937252044678, "reward_std": 1.0765037536621094, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4158437252044678, "step": 3171 }, { "completion_length": 120.71875, "epoch": 1.6971642589620117, "grad_norm": 1.2423707246780396, "kl": 0.17838698625564575, "learning_rate": 2.36309856083491e-06, "loss": 0.0071, "reward": 2.031843662261963, "reward_std": 0.8800145983695984, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48496875166893005, "step": 3172 }, { "completion_length": 138.3125, "epoch": 1.6976993044408775, "grad_norm": 1.5636112689971924, "kl": 0.3221181631088257, "learning_rate": 2.3615444461989844e-06, "loss": 0.0129, "reward": 1.9158437252044678, "reward_std": 0.6913692951202393, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46271875500679016, "step": 3173 }, { "completion_length": 134.46875, "epoch": 1.6982343499197432, "grad_norm": 1.095141053199768, "kl": 0.18491460382938385, "learning_rate": 2.359990385231084e-06, "loss": 0.0074, "reward": 2.224874973297119, "reward_std": 0.6358193755149841, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45925000309944153, "step": 3174 }, { "completion_length": 133.3125, "epoch": 1.698769395398609, "grad_norm": 0.7541701197624207, "kl": 0.17949026823043823, "learning_rate": 2.3584363785335945e-06, "loss": 0.0072, "reward": 1.8853750228881836, "reward_std": 0.9539765119552612, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4166249930858612, "step": 3175 }, { "completion_length": 135.25, "epoch": 1.6993044408774747, "grad_norm": 0.6893708109855652, "kl": 0.11504117399454117, "learning_rate": 2.3568824267088766e-06, "loss": 0.0046, "reward": 1.586343765258789, "reward_std": 0.8657718896865845, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.41446876525878906, "step": 3176 }, { "completion_length": 126.4375, "epoch": 1.6998394863563404, "grad_norm": 0.8380654454231262, "kl": 0.13609960675239563, "learning_rate": 2.3553285303592733e-06, "loss": 0.0054, "reward": 2.108281135559082, "reward_std": 0.45358896255493164, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4832812547683716, "step": 3177 }, { "completion_length": 140.34375, "epoch": 1.7003745318352061, "grad_norm": 1.2048155069351196, "kl": 0.1580064296722412, "learning_rate": 2.353774690087103e-06, "loss": 0.0063, "reward": 1.4948437213897705, "reward_std": 0.7653907537460327, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4010937511920929, "step": 3178 }, { "completion_length": 144.5625, "epoch": 1.7009095773140717, "grad_norm": 1.4228593111038208, "kl": 0.14947275817394257, "learning_rate": 2.352220906494665e-06, "loss": 0.006, "reward": 1.4721250534057617, "reward_std": 0.7148675918579102, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45650002360343933, "step": 3179 }, { "completion_length": 106.53125, "epoch": 1.7014446227929374, "grad_norm": 1.5835704803466797, "kl": 0.2226097136735916, "learning_rate": 2.3506671801842366e-06, "loss": 0.0089, "reward": 2.637531280517578, "reward_std": 0.6618923544883728, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48128125071525574, "step": 3180 }, { "completion_length": 130.71875, "epoch": 1.7019796682718031, "grad_norm": 1.9119704961776733, "kl": 0.16760829091072083, "learning_rate": 2.3491135117580696e-06, "loss": 0.0067, "reward": 1.7889374494552612, "reward_std": 0.7230865955352783, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4608124792575836, "step": 3181 }, { "completion_length": 117.1875, "epoch": 1.7025147137506687, "grad_norm": 1.0533874034881592, "kl": 0.20371517539024353, "learning_rate": 2.3475599018183976e-06, "loss": 0.0081, "reward": 2.3522186279296875, "reward_std": 0.7878820896148682, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47721874713897705, "step": 3182 }, { "completion_length": 134.71875, "epoch": 1.7030497592295344, "grad_norm": 0.8566257357597351, "kl": 0.1804002821445465, "learning_rate": 2.346006350967428e-06, "loss": 0.0072, "reward": 2.4581875801086426, "reward_std": 0.8490393161773682, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4269375205039978, "step": 3183 }, { "completion_length": 114.09375, "epoch": 1.7035848047084001, "grad_norm": 0.6777130961418152, "kl": 0.2494317889213562, "learning_rate": 2.344452859807348e-06, "loss": 0.01, "reward": 2.233656406402588, "reward_std": 0.700225293636322, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48365625739097595, "step": 3184 }, { "completion_length": 130.53125, "epoch": 1.7041198501872659, "grad_norm": 0.7708522081375122, "kl": 0.12337937951087952, "learning_rate": 2.3428994289403206e-06, "loss": 0.0049, "reward": 2.793968677520752, "reward_std": 0.586874783039093, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4658437669277191, "step": 3185 }, { "completion_length": 144.375, "epoch": 1.7046548956661316, "grad_norm": 0.6248936057090759, "kl": 0.136185884475708, "learning_rate": 2.3413460589684843e-06, "loss": 0.0054, "reward": 1.4765937328338623, "reward_std": 0.7952174544334412, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4297187328338623, "step": 3186 }, { "completion_length": 115.375, "epoch": 1.7051899411449973, "grad_norm": 1.220821738243103, "kl": 0.18804770708084106, "learning_rate": 2.339792750493957e-06, "loss": 0.0075, "reward": 3.14453125, "reward_std": 0.6580443382263184, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 3187 }, { "completion_length": 132.9375, "epoch": 1.705724986623863, "grad_norm": 2.8163299560546875, "kl": 0.15951532125473022, "learning_rate": 2.338239504118829e-06, "loss": 0.0064, "reward": 1.9035313129425049, "reward_std": 0.8142567873001099, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4504062235355377, "step": 3188 }, { "completion_length": 111.59375, "epoch": 1.7062600321027288, "grad_norm": 2.0575978755950928, "kl": 0.18123501539230347, "learning_rate": 2.336686320445168e-06, "loss": 0.0072, "reward": 1.977156162261963, "reward_std": 0.8063124418258667, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47715625166893005, "step": 3189 }, { "completion_length": 134.46875, "epoch": 1.7067950775815945, "grad_norm": 9.927448272705078, "kl": 0.46287646889686584, "learning_rate": 2.33513320007502e-06, "loss": 0.0185, "reward": 2.001406192779541, "reward_std": 0.8143668174743652, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4545312523841858, "step": 3190 }, { "completion_length": 132.0625, "epoch": 1.7073301230604603, "grad_norm": 1.1493465900421143, "kl": 0.1646169126033783, "learning_rate": 2.333580143610402e-06, "loss": 0.0066, "reward": 1.9869375228881836, "reward_std": 0.928698718547821, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4400625228881836, "step": 3191 }, { "completion_length": 138.5625, "epoch": 1.7078651685393258, "grad_norm": 24.783674240112305, "kl": 5.971104621887207, "learning_rate": 2.3320271516533104e-06, "loss": 0.2388, "reward": 1.702625036239624, "reward_std": 0.9705759286880493, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43699997663497925, "step": 3192 }, { "completion_length": 115.5, "epoch": 1.7084002140181915, "grad_norm": 3.5597548484802246, "kl": 0.22137150168418884, "learning_rate": 2.3304742248057126e-06, "loss": 0.0089, "reward": 1.3613437414169312, "reward_std": 0.7110645174980164, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45509374141693115, "step": 3193 }, { "completion_length": 113.4375, "epoch": 1.7089352594970573, "grad_norm": 0.8851496577262878, "kl": 0.17942382395267487, "learning_rate": 2.328921363669554e-06, "loss": 0.0072, "reward": 2.374875068664551, "reward_std": 0.8332825303077698, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 3194 }, { "completion_length": 128.875, "epoch": 1.709470304975923, "grad_norm": 2.5097362995147705, "kl": 0.20429867506027222, "learning_rate": 2.327368568846754e-06, "loss": 0.0082, "reward": 2.043656349182129, "reward_std": 1.0046048164367676, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44990625977516174, "step": 3195 }, { "completion_length": 150.6875, "epoch": 1.7100053504547885, "grad_norm": 1.5355446338653564, "kl": 0.14073960483074188, "learning_rate": 2.325815840939204e-06, "loss": 0.0056, "reward": 1.605375051498413, "reward_std": 0.6648403406143188, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4334999918937683, "step": 3196 }, { "completion_length": 127.375, "epoch": 1.7105403959336543, "grad_norm": 17.9588623046875, "kl": 0.20722103118896484, "learning_rate": 2.324263180548772e-06, "loss": 0.0083, "reward": 2.1818125247955322, "reward_std": 0.5530619025230408, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47868749499320984, "step": 3197 }, { "completion_length": 124.40625, "epoch": 1.71107544141252, "grad_norm": 0.7876779437065125, "kl": 0.1783183515071869, "learning_rate": 2.3227105882772988e-06, "loss": 0.0071, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 3198 }, { "completion_length": 150.5625, "epoch": 1.7116104868913857, "grad_norm": 0.8962150812149048, "kl": 0.13449187576770782, "learning_rate": 2.3211580647265987e-06, "loss": 0.0054, "reward": 1.80859375, "reward_std": 0.6079303026199341, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46484375, "step": 3199 }, { "completion_length": 123.28125, "epoch": 1.7121455323702515, "grad_norm": 1.288654088973999, "kl": 0.2950373888015747, "learning_rate": 2.3196056104984603e-06, "loss": 0.0118, "reward": 2.169468879699707, "reward_std": 1.082622766494751, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4663437604904175, "step": 3200 }, { "completion_length": 150.65625, "epoch": 1.7126805778491172, "grad_norm": 0.8837084770202637, "kl": 0.1546267718076706, "learning_rate": 2.3180532261946438e-06, "loss": 0.0062, "reward": 1.3741874694824219, "reward_std": 0.6737341284751892, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42106249928474426, "step": 3201 }, { "completion_length": 138.9375, "epoch": 1.713215623327983, "grad_norm": 0.6251791715621948, "kl": 0.15133056044578552, "learning_rate": 2.3165009124168835e-06, "loss": 0.0061, "reward": 2.1413750648498535, "reward_std": 0.6060399413108826, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48512500524520874, "step": 3202 }, { "completion_length": 146.34375, "epoch": 1.7137506688068487, "grad_norm": 1.090428352355957, "kl": 0.1519504189491272, "learning_rate": 2.3149486697668853e-06, "loss": 0.0061, "reward": 2.046062469482422, "reward_std": 1.066632628440857, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45231249928474426, "step": 3203 }, { "completion_length": 123.84375, "epoch": 1.7142857142857144, "grad_norm": 1.5349928140640259, "kl": 0.15876558423042297, "learning_rate": 2.313396498846329e-06, "loss": 0.0064, "reward": 2.15625, "reward_std": 1.0349699258804321, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 3204 }, { "completion_length": 144.625, "epoch": 1.7148207597645801, "grad_norm": 0.4907825291156769, "kl": 0.15177476406097412, "learning_rate": 2.3118444002568663e-06, "loss": 0.0061, "reward": 1.6531875133514404, "reward_std": 0.7021917700767517, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.43443751335144043, "step": 3205 }, { "completion_length": 131.0, "epoch": 1.7153558052434457, "grad_norm": 0.7118580341339111, "kl": 0.15233546495437622, "learning_rate": 2.3102923746001192e-06, "loss": 0.0061, "reward": 1.6332499980926514, "reward_std": 0.8829143047332764, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43012499809265137, "step": 3206 }, { "completion_length": 119.6875, "epoch": 1.7158908507223114, "grad_norm": 0.6771844029426575, "kl": 0.21440072357654572, "learning_rate": 2.3087404224776837e-06, "loss": 0.0086, "reward": 2.3936874866485596, "reward_std": 0.7197636365890503, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45618748664855957, "step": 3207 }, { "completion_length": 142.125, "epoch": 1.7164258962011771, "grad_norm": 2.0356369018554688, "kl": 0.13744686543941498, "learning_rate": 2.3071885444911237e-06, "loss": 0.0055, "reward": 1.9780625104904175, "reward_std": 0.7967649102210999, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4624375104904175, "step": 3208 }, { "completion_length": 134.0, "epoch": 1.7169609416800427, "grad_norm": 0.9632256627082825, "kl": 0.19368067383766174, "learning_rate": 2.3056367412419793e-06, "loss": 0.0077, "reward": 2.3318748474121094, "reward_std": 0.8514953851699829, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4724999964237213, "step": 3209 }, { "completion_length": 117.625, "epoch": 1.7174959871589084, "grad_norm": 1.189895749092102, "kl": 0.22652611136436462, "learning_rate": 2.3040850133317598e-06, "loss": 0.0091, "reward": 2.5262813568115234, "reward_std": 0.7249923348426819, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4794062376022339, "step": 3210 }, { "completion_length": 114.71875, "epoch": 1.7180310326377741, "grad_norm": 6.434771537780762, "kl": 0.37854233384132385, "learning_rate": 2.302533361361942e-06, "loss": 0.0151, "reward": 2.660968780517578, "reward_std": 0.6558390259742737, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48909375071525574, "step": 3211 }, { "completion_length": 135.5, "epoch": 1.7185660781166399, "grad_norm": 1.1774325370788574, "kl": 0.18648913502693176, "learning_rate": 2.3009817859339775e-06, "loss": 0.0075, "reward": 2.2508749961853027, "reward_std": 0.9736047983169556, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46962499618530273, "step": 3212 }, { "completion_length": 146.46875, "epoch": 1.7191011235955056, "grad_norm": 1.368481993675232, "kl": 0.17930004000663757, "learning_rate": 2.2994302876492874e-06, "loss": 0.0072, "reward": 2.1709063053131104, "reward_std": 1.064078450202942, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4521562457084656, "step": 3213 }, { "completion_length": 158.25, "epoch": 1.7196361690743713, "grad_norm": 1.5913784503936768, "kl": 0.2054295837879181, "learning_rate": 2.29787886710926e-06, "loss": 0.0082, "reward": 1.30078125, "reward_std": 0.7431464195251465, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.42578125, "step": 3214 }, { "completion_length": 124.875, "epoch": 1.720171214553237, "grad_norm": 1.180896282196045, "kl": 0.15731939673423767, "learning_rate": 2.296327524915258e-06, "loss": 0.0063, "reward": 2.152156352996826, "reward_std": 0.5490114092826843, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4802812337875366, "step": 3215 }, { "completion_length": 135.03125, "epoch": 1.7207062600321028, "grad_norm": 1.2502082586288452, "kl": 0.15604281425476074, "learning_rate": 2.2947762616686095e-06, "loss": 0.0062, "reward": 2.165562629699707, "reward_std": 0.506086528301239, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4780625104904175, "step": 3216 }, { "completion_length": 105.09375, "epoch": 1.7212413055109685, "grad_norm": 0.7539623975753784, "kl": 0.22658345103263855, "learning_rate": 2.2932250779706148e-06, "loss": 0.0091, "reward": 2.828125, "reward_std": 0.9870055913925171, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 3217 }, { "completion_length": 138.8125, "epoch": 1.7217763509898343, "grad_norm": 1.4991577863693237, "kl": 0.21453829109668732, "learning_rate": 2.2916739744225426e-06, "loss": 0.0086, "reward": 1.9210937023162842, "reward_std": 0.8245946168899536, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45234376192092896, "step": 3218 }, { "completion_length": 137.09375, "epoch": 1.7223113964686998, "grad_norm": 0.6802608966827393, "kl": 0.14859840273857117, "learning_rate": 2.290122951625629e-06, "loss": 0.0059, "reward": 1.773937463760376, "reward_std": 0.6811206340789795, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46143752336502075, "step": 3219 }, { "completion_length": 124.0, "epoch": 1.7228464419475655, "grad_norm": 0.9443618655204773, "kl": 0.13735529780387878, "learning_rate": 2.288572010181082e-06, "loss": 0.0055, "reward": 2.882999897003174, "reward_std": 0.576507031917572, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4923750162124634, "step": 3220 }, { "completion_length": 137.90625, "epoch": 1.7233814874264313, "grad_norm": 1.102286696434021, "kl": 0.1694003790616989, "learning_rate": 2.287021150690075e-06, "loss": 0.0068, "reward": 2.282437562942505, "reward_std": 0.8924881219863892, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4699375033378601, "step": 3221 }, { "completion_length": 119.5625, "epoch": 1.723916532905297, "grad_norm": 0.9995924830436707, "kl": 0.18513508141040802, "learning_rate": 2.2854703737537513e-06, "loss": 0.0074, "reward": 2.267031192779541, "reward_std": 0.8082706332206726, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4701562523841858, "step": 3222 }, { "completion_length": 134.46875, "epoch": 1.7244515783841625, "grad_norm": 0.712518572807312, "kl": 0.15135368704795837, "learning_rate": 2.283919679973222e-06, "loss": 0.0061, "reward": 2.5390625, "reward_std": 0.865066647529602, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4765625, "step": 3223 }, { "completion_length": 124.5625, "epoch": 1.7249866238630283, "grad_norm": 90.05574798583984, "kl": 1.2675079107284546, "learning_rate": 2.2823690699495645e-06, "loss": 0.0507, "reward": 2.371218681335449, "reward_std": 1.1633809804916382, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.433718740940094, "step": 3224 }, { "completion_length": 123.875, "epoch": 1.725521669341894, "grad_norm": 1.158337116241455, "kl": 0.3042198717594147, "learning_rate": 2.280818544283827e-06, "loss": 0.0122, "reward": 2.325031280517578, "reward_std": 0.9584804773330688, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46565625071525574, "step": 3225 }, { "completion_length": 133.0625, "epoch": 1.7260567148207597, "grad_norm": 4.664214134216309, "kl": 0.5346508026123047, "learning_rate": 2.2792681035770212e-06, "loss": 0.0214, "reward": 2.14453125, "reward_std": 0.7762045860290527, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 3226 }, { "completion_length": 108.34375, "epoch": 1.7265917602996255, "grad_norm": 1.5897879600524902, "kl": 0.1991264522075653, "learning_rate": 2.277717748430128e-06, "loss": 0.008, "reward": 2.2783751487731934, "reward_std": 0.78839111328125, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4814999997615814, "step": 3227 }, { "completion_length": 137.9375, "epoch": 1.7271268057784912, "grad_norm": 1.386596918106079, "kl": 0.12777923047542572, "learning_rate": 2.2761674794440958e-06, "loss": 0.0051, "reward": 2.1537814140319824, "reward_std": 1.0059640407562256, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4350312650203705, "step": 3228 }, { "completion_length": 135.03125, "epoch": 1.727661851257357, "grad_norm": 4.876460075378418, "kl": 0.44767239689826965, "learning_rate": 2.274617297219837e-06, "loss": 0.0179, "reward": 1.3817499876022339, "reward_std": 0.6926093101501465, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4442499876022339, "step": 3229 }, { "completion_length": 170.6875, "epoch": 1.7281968967362227, "grad_norm": 1.088972806930542, "kl": 0.20346221327781677, "learning_rate": 2.273067202358234e-06, "loss": 0.0081, "reward": 1.3870313167572021, "reward_std": 0.7202945947647095, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3870312571525574, "step": 3230 }, { "completion_length": 133.0625, "epoch": 1.7287319422150884, "grad_norm": 2.1540138721466064, "kl": 0.15649311244487762, "learning_rate": 2.2715171954601303e-06, "loss": 0.0063, "reward": 2.3348751068115234, "reward_std": 0.6152481436729431, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4598749876022339, "step": 3231 }, { "completion_length": 134.625, "epoch": 1.7292669876939541, "grad_norm": 0.8920719623565674, "kl": 0.13389301300048828, "learning_rate": 2.2699672771263405e-06, "loss": 0.0054, "reward": 2.1169376373291016, "reward_std": 0.6250877976417542, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4763124883174896, "step": 3232 }, { "completion_length": 136.21875, "epoch": 1.7298020331728197, "grad_norm": 1.8450303077697754, "kl": 0.23067280650138855, "learning_rate": 2.268417447957643e-06, "loss": 0.0092, "reward": 2.0131874084472656, "reward_std": 0.7966048717498779, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4506874978542328, "step": 3233 }, { "completion_length": 148.75, "epoch": 1.7303370786516854, "grad_norm": 1.1895160675048828, "kl": 0.20246009528636932, "learning_rate": 2.266867708554779e-06, "loss": 0.0081, "reward": 2.2368435859680176, "reward_std": 0.9069904685020447, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4087187349796295, "step": 3234 }, { "completion_length": 123.90625, "epoch": 1.7308721241305511, "grad_norm": 0.7593695521354675, "kl": 0.23999856412410736, "learning_rate": 2.2653180595184605e-06, "loss": 0.0096, "reward": 2.6370935440063477, "reward_std": 0.540147066116333, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48084375262260437, "step": 3235 }, { "completion_length": 125.125, "epoch": 1.7314071696094167, "grad_norm": 7.365932464599609, "kl": 0.49081146717071533, "learning_rate": 2.2637685014493576e-06, "loss": 0.0196, "reward": 1.9387187957763672, "reward_std": 0.6311416625976562, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4543437361717224, "step": 3236 }, { "completion_length": 127.03125, "epoch": 1.7319422150882824, "grad_norm": 1.0712053775787354, "kl": 0.18661780655384064, "learning_rate": 2.26221903494811e-06, "loss": 0.0075, "reward": 2.1200623512268066, "reward_std": 0.7389329075813293, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4950624704360962, "step": 3237 }, { "completion_length": 119.78125, "epoch": 1.7324772605671481, "grad_norm": 1.174810528755188, "kl": 0.24776360392570496, "learning_rate": 2.2606696606153227e-06, "loss": 0.0099, "reward": 2.803906202316284, "reward_std": 0.8715996146202087, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47578126192092896, "step": 3238 }, { "completion_length": 128.125, "epoch": 1.7330123060460139, "grad_norm": 1.0109906196594238, "kl": 0.18758954107761383, "learning_rate": 2.25912037905156e-06, "loss": 0.0075, "reward": 1.7214686870574951, "reward_std": 0.5621557831764221, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4870937764644623, "step": 3239 }, { "completion_length": 138.71875, "epoch": 1.7335473515248796, "grad_norm": 1.0036991834640503, "kl": 0.18988674879074097, "learning_rate": 2.257571190857355e-06, "loss": 0.0076, "reward": 1.1446561813354492, "reward_std": 0.3701785206794739, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.441531240940094, "step": 3240 }, { "completion_length": 127.0625, "epoch": 1.7340823970037453, "grad_norm": 0.8904069066047668, "kl": 0.18522082269191742, "learning_rate": 2.256022096633201e-06, "loss": 0.0074, "reward": 2.872406244277954, "reward_std": 0.6975911855697632, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4505312442779541, "step": 3241 }, { "completion_length": 118.09375, "epoch": 1.734617442482611, "grad_norm": 0.9641221761703491, "kl": 0.2614685297012329, "learning_rate": 2.254473096979558e-06, "loss": 0.0105, "reward": 2.151343822479248, "reward_std": 0.8431257009506226, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46384376287460327, "step": 3242 }, { "completion_length": 139.625, "epoch": 1.7351524879614768, "grad_norm": 0.7945654988288879, "kl": 0.12616978585720062, "learning_rate": 2.2529241924968477e-06, "loss": 0.005, "reward": 1.8359375, "reward_std": 0.5679908990859985, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4453125, "step": 3243 }, { "completion_length": 155.0625, "epoch": 1.7356875334403425, "grad_norm": 1.2562881708145142, "kl": 0.22328312695026398, "learning_rate": 2.251375383785455e-06, "loss": 0.0089, "reward": 1.9124062061309814, "reward_std": 0.8519997596740723, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4280312657356262, "step": 3244 }, { "completion_length": 130.4375, "epoch": 1.7362225789192083, "grad_norm": 0.82586669921875, "kl": 0.1555856466293335, "learning_rate": 2.2498266714457277e-06, "loss": 0.0062, "reward": 1.8130625486373901, "reward_std": 0.6021952629089355, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45368748903274536, "step": 3245 }, { "completion_length": 152.125, "epoch": 1.7367576243980738, "grad_norm": 0.9211640954017639, "kl": 0.1450071930885315, "learning_rate": 2.2482780560779767e-06, "loss": 0.0058, "reward": 1.76381254196167, "reward_std": 0.5522081255912781, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42006248235702515, "step": 3246 }, { "completion_length": 122.25, "epoch": 1.7372926698769395, "grad_norm": 24.866586685180664, "kl": 0.40961796045303345, "learning_rate": 2.2467295382824743e-06, "loss": 0.0164, "reward": 2.307406187057495, "reward_std": 0.8766547441482544, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4949062466621399, "step": 3247 }, { "completion_length": 119.5, "epoch": 1.7378277153558053, "grad_norm": 0.5760522484779358, "kl": 0.1502894163131714, "learning_rate": 2.245181118659457e-06, "loss": 0.006, "reward": 2.106750011444092, "reward_std": 0.4185950756072998, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4817500114440918, "step": 3248 }, { "completion_length": 121.5, "epoch": 1.7383627608346708, "grad_norm": 1.1039100885391235, "kl": 0.15953674912452698, "learning_rate": 2.243632797809121e-06, "loss": 0.0064, "reward": 2.115281105041504, "reward_std": 0.48111802339553833, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47465625405311584, "step": 3249 }, { "completion_length": 110.03125, "epoch": 1.7388978063135365, "grad_norm": 1.3030201196670532, "kl": 0.24924910068511963, "learning_rate": 2.2420845763316245e-06, "loss": 0.01, "reward": 1.9796249866485596, "reward_std": 0.6527231931686401, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47962498664855957, "step": 3250 }, { "completion_length": 147.5, "epoch": 1.7394328517924023, "grad_norm": 2.925873041152954, "kl": 0.36858388781547546, "learning_rate": 2.240536454827089e-06, "loss": 0.0147, "reward": 1.682843804359436, "reward_std": 0.9008716940879822, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.38596874475479126, "step": 3251 }, { "completion_length": 123.21875, "epoch": 1.739967897271268, "grad_norm": 0.8596987128257751, "kl": 0.22993667423725128, "learning_rate": 2.2389884338955954e-06, "loss": 0.0092, "reward": 1.4939374923706055, "reward_std": 0.6896212100982666, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46268749237060547, "step": 3252 }, { "completion_length": 141.96875, "epoch": 1.7405029427501337, "grad_norm": 0.6912164092063904, "kl": 0.17577575147151947, "learning_rate": 2.2374405141371877e-06, "loss": 0.007, "reward": 1.8276562690734863, "reward_std": 0.6402834057807922, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.42140626907348633, "step": 3253 }, { "completion_length": 139.34375, "epoch": 1.7410379882289995, "grad_norm": 2.1032216548919678, "kl": 0.2057492583990097, "learning_rate": 2.235892696151866e-06, "loss": 0.0082, "reward": 2.282531261444092, "reward_std": 0.771824300289154, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4387812614440918, "step": 3254 }, { "completion_length": 143.78125, "epoch": 1.7415730337078652, "grad_norm": 4.239270210266113, "kl": 0.2690420150756836, "learning_rate": 2.2343449805395973e-06, "loss": 0.0108, "reward": 1.8486249446868896, "reward_std": 0.9707812070846558, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4111250042915344, "step": 3255 }, { "completion_length": 144.59375, "epoch": 1.742108079186731, "grad_norm": 0.9807921051979065, "kl": 0.12972678244113922, "learning_rate": 2.2327973679003057e-06, "loss": 0.0052, "reward": 2.1093437671661377, "reward_std": 0.853806734085083, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4687187671661377, "step": 3256 }, { "completion_length": 147.8125, "epoch": 1.7426431246655967, "grad_norm": 1.7495853900909424, "kl": 0.2002137452363968, "learning_rate": 2.231249858833873e-06, "loss": 0.008, "reward": 1.530250072479248, "reward_std": 0.8059548735618591, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42087501287460327, "step": 3257 }, { "completion_length": 122.65625, "epoch": 1.7431781701444624, "grad_norm": 0.670139729976654, "kl": 0.194553405046463, "learning_rate": 2.2297024539401463e-06, "loss": 0.0078, "reward": 2.2338438034057617, "reward_std": 0.4388889670372009, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46821874380111694, "step": 3258 }, { "completion_length": 133.34375, "epoch": 1.743713215623328, "grad_norm": 0.48832395672798157, "kl": 0.18752887845039368, "learning_rate": 2.228155153818927e-06, "loss": 0.0075, "reward": 2.188906192779541, "reward_std": 0.804332435131073, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4389062523841858, "step": 3259 }, { "completion_length": 146.375, "epoch": 1.7442482611021937, "grad_norm": 2.9040541648864746, "kl": 0.1798097789287567, "learning_rate": 2.2266079590699795e-06, "loss": 0.0072, "reward": 1.4181874990463257, "reward_std": 0.601876974105835, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3713124990463257, "step": 3260 }, { "completion_length": 139.03125, "epoch": 1.7447833065810594, "grad_norm": 1.2898852825164795, "kl": 0.18281880021095276, "learning_rate": 2.2250608702930266e-06, "loss": 0.0073, "reward": 1.973343849182129, "reward_std": 0.7209975719451904, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45771875977516174, "step": 3261 }, { "completion_length": 127.15625, "epoch": 1.7453183520599251, "grad_norm": 1.0879088640213013, "kl": 0.23867201805114746, "learning_rate": 2.2235138880877483e-06, "loss": 0.0095, "reward": 2.5177812576293945, "reward_std": 1.089781403541565, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48653125762939453, "step": 3262 }, { "completion_length": 137.65625, "epoch": 1.7458533975387907, "grad_norm": 0.9060973525047302, "kl": 0.15959350764751434, "learning_rate": 2.221967013053786e-06, "loss": 0.0064, "reward": 2.010624885559082, "reward_std": 0.5934900641441345, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4637500047683716, "step": 3263 }, { "completion_length": 143.46875, "epoch": 1.7463884430176564, "grad_norm": 1.839920163154602, "kl": 0.1705307960510254, "learning_rate": 2.2204202457907366e-06, "loss": 0.0068, "reward": 1.7273437976837158, "reward_std": 0.6374247670173645, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43046873807907104, "step": 3264 }, { "completion_length": 149.875, "epoch": 1.7469234884965221, "grad_norm": 0.81582111120224, "kl": 0.1724143773317337, "learning_rate": 2.218873586898157e-06, "loss": 0.0069, "reward": 1.226312518119812, "reward_std": 0.7127342224121094, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3981874883174896, "step": 3265 }, { "completion_length": 140.125, "epoch": 1.7474585339753879, "grad_norm": 0.7978034019470215, "kl": 0.20114868879318237, "learning_rate": 2.217327036975563e-06, "loss": 0.008, "reward": 1.7568124532699585, "reward_std": 0.4130913019180298, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44431251287460327, "step": 3266 }, { "completion_length": 119.9375, "epoch": 1.7479935794542536, "grad_norm": 1.0868289470672607, "kl": 0.1996924877166748, "learning_rate": 2.215780596622426e-06, "loss": 0.008, "reward": 2.3812499046325684, "reward_std": 0.6951901912689209, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4593749940395355, "step": 3267 }, { "completion_length": 126.6875, "epoch": 1.7485286249331193, "grad_norm": 1.3508050441741943, "kl": 0.22607478499412537, "learning_rate": 2.2142342664381764e-06, "loss": 0.009, "reward": 2.6522812843322754, "reward_std": 0.6042149662971497, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.433531254529953, "step": 3268 }, { "completion_length": 131.8125, "epoch": 1.749063670411985, "grad_norm": 0.8206002712249756, "kl": 0.21470975875854492, "learning_rate": 2.2126880470222007e-06, "loss": 0.0086, "reward": 2.097062587738037, "reward_std": 0.4262017011642456, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48768749833106995, "step": 3269 }, { "completion_length": 116.1875, "epoch": 1.7495987158908508, "grad_norm": 0.5314321517944336, "kl": 0.15884184837341309, "learning_rate": 2.2111419389738433e-06, "loss": 0.0064, "reward": 1.6406562328338623, "reward_std": 0.8065899014472961, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4687812328338623, "step": 3270 }, { "completion_length": 120.3125, "epoch": 1.7501337613697165, "grad_norm": 1.4267405271530151, "kl": 0.17142769694328308, "learning_rate": 2.2095959428924067e-06, "loss": 0.0069, "reward": 1.6434375047683716, "reward_std": 0.48659056425094604, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4715625047683716, "step": 3271 }, { "completion_length": 120.875, "epoch": 1.7506688068485823, "grad_norm": 1.3398966789245605, "kl": 0.18028634786605835, "learning_rate": 2.2080500593771464e-06, "loss": 0.0072, "reward": 1.9274063110351562, "reward_std": 0.7070645093917847, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4430312514305115, "step": 3272 }, { "completion_length": 115.46875, "epoch": 1.7512038523274478, "grad_norm": 1.2099337577819824, "kl": 0.15560701489448547, "learning_rate": 2.2065042890272793e-06, "loss": 0.0062, "reward": 2.7130625247955322, "reward_std": 0.5538989305496216, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49431249499320984, "step": 3273 }, { "completion_length": 142.125, "epoch": 1.7517388978063135, "grad_norm": 1.0146211385726929, "kl": 0.14696693420410156, "learning_rate": 2.204958632441972e-06, "loss": 0.0059, "reward": 2.4543750286102295, "reward_std": 0.9023129343986511, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4699999988079071, "step": 3274 }, { "completion_length": 126.625, "epoch": 1.7522739432851793, "grad_norm": 2.18161678314209, "kl": 0.16625100374221802, "learning_rate": 2.203413090220353e-06, "loss": 0.0067, "reward": 2.330312490463257, "reward_std": 0.7074183821678162, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45531249046325684, "step": 3275 }, { "completion_length": 146.8125, "epoch": 1.7528089887640448, "grad_norm": 2.7145535945892334, "kl": 0.20545372366905212, "learning_rate": 2.2018676629615046e-06, "loss": 0.0082, "reward": 1.5418124198913574, "reward_std": 0.8115454912185669, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4480625092983246, "step": 3276 }, { "completion_length": 127.0625, "epoch": 1.7533440342429105, "grad_norm": 1.7636909484863281, "kl": 0.25649160146713257, "learning_rate": 2.2003223512644617e-06, "loss": 0.0103, "reward": 2.195406198501587, "reward_std": 0.9074314832687378, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4766562581062317, "step": 3277 }, { "completion_length": 146.0625, "epoch": 1.7538790797217763, "grad_norm": 1.8143830299377441, "kl": 0.20566409826278687, "learning_rate": 2.1987771557282195e-06, "loss": 0.0082, "reward": 1.497093677520752, "reward_std": 0.5307521820068359, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46584373712539673, "step": 3278 }, { "completion_length": 114.625, "epoch": 1.754414125200642, "grad_norm": 0.7063806056976318, "kl": 0.20892682671546936, "learning_rate": 2.1972320769517226e-06, "loss": 0.0084, "reward": 2.1244688034057617, "reward_std": 0.7864822149276733, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45259374380111694, "step": 3279 }, { "completion_length": 125.90625, "epoch": 1.7549491706795077, "grad_norm": 0.8012524247169495, "kl": 0.14902037382125854, "learning_rate": 2.1956871155338742e-06, "loss": 0.006, "reward": 2.2973124980926514, "reward_std": 0.8266226649284363, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46918749809265137, "step": 3280 }, { "completion_length": 139.8125, "epoch": 1.7554842161583735, "grad_norm": 0.7726689577102661, "kl": 0.18034303188323975, "learning_rate": 2.1941422720735334e-06, "loss": 0.0072, "reward": 2.308187484741211, "reward_std": 0.845982551574707, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41756248474121094, "step": 3281 }, { "completion_length": 127.375, "epoch": 1.7560192616372392, "grad_norm": 0.752392590045929, "kl": 0.1525973081588745, "learning_rate": 2.1925975471695075e-06, "loss": 0.0061, "reward": 2.181906223297119, "reward_std": 0.731747031211853, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47878125309944153, "step": 3282 }, { "completion_length": 137.5, "epoch": 1.756554307116105, "grad_norm": 1.3799035549163818, "kl": 0.2316935658454895, "learning_rate": 2.1910529414205624e-06, "loss": 0.0093, "reward": 1.813156247138977, "reward_std": 0.6559699773788452, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.42253124713897705, "step": 3283 }, { "completion_length": 136.28125, "epoch": 1.7570893525949707, "grad_norm": 1.223997712135315, "kl": 0.21661441028118134, "learning_rate": 2.1895084554254187e-06, "loss": 0.0087, "reward": 2.0833749771118164, "reward_std": 0.8898701071739197, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4114999771118164, "step": 3284 }, { "completion_length": 120.4375, "epoch": 1.7576243980738364, "grad_norm": 1.1342074871063232, "kl": 0.22098717093467712, "learning_rate": 2.187964089782747e-06, "loss": 0.0088, "reward": 2.3264689445495605, "reward_std": 1.1502076387405396, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4670937657356262, "step": 3285 }, { "completion_length": 130.0, "epoch": 1.758159443552702, "grad_norm": 1.9326750040054321, "kl": 0.18249543011188507, "learning_rate": 2.1864198450911736e-06, "loss": 0.0073, "reward": 2.0707812309265137, "reward_std": 0.7191476821899414, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46140626072883606, "step": 3286 }, { "completion_length": 148.65625, "epoch": 1.7586944890315677, "grad_norm": 0.6988814473152161, "kl": 0.12097938358783722, "learning_rate": 2.184875721949277e-06, "loss": 0.0048, "reward": 1.7265312671661377, "reward_std": 1.024060606956482, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4296562671661377, "step": 3287 }, { "completion_length": 132.1875, "epoch": 1.7592295345104334, "grad_norm": 0.8723441958427429, "kl": 0.22438254952430725, "learning_rate": 2.183331720955589e-06, "loss": 0.009, "reward": 1.4655624628067017, "reward_std": 0.5652958750724792, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41868749260902405, "step": 3288 }, { "completion_length": 109.71875, "epoch": 1.7597645799892991, "grad_norm": 0.9434627890586853, "kl": 0.18400803208351135, "learning_rate": 2.181787842708595e-06, "loss": 0.0074, "reward": 2.5177812576293945, "reward_std": 0.8905234336853027, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48653125762939453, "step": 3289 }, { "completion_length": 146.78125, "epoch": 1.7602996254681647, "grad_norm": 1.2695988416671753, "kl": 0.1815491020679474, "learning_rate": 2.18024408780673e-06, "loss": 0.0073, "reward": 1.8341562747955322, "reward_std": 0.9466429948806763, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.42790624499320984, "step": 3290 }, { "completion_length": 133.25, "epoch": 1.7608346709470304, "grad_norm": 0.8579332232475281, "kl": 0.15363465249538422, "learning_rate": 2.1787004568483848e-06, "loss": 0.0061, "reward": 2.5179061889648438, "reward_std": 0.7377842664718628, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4710312485694885, "step": 3291 }, { "completion_length": 136.46875, "epoch": 1.7613697164258961, "grad_norm": 1.1229231357574463, "kl": 0.28423207998275757, "learning_rate": 2.1771569504318987e-06, "loss": 0.0114, "reward": 2.5596561431884766, "reward_std": 1.1487531661987305, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4659062623977661, "step": 3292 }, { "completion_length": 122.65625, "epoch": 1.7619047619047619, "grad_norm": 2.1525650024414062, "kl": 0.18154022097587585, "learning_rate": 2.175613569155565e-06, "loss": 0.0073, "reward": 1.3419687747955322, "reward_std": 0.41281643509864807, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43571874499320984, "step": 3293 }, { "completion_length": 125.375, "epoch": 1.7624398073836276, "grad_norm": 1.086317777633667, "kl": 0.2759593725204468, "learning_rate": 2.1740703136176287e-06, "loss": 0.011, "reward": 2.509000062942505, "reward_std": 0.7449996471405029, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4621250033378601, "step": 3294 }, { "completion_length": 109.0625, "epoch": 1.7629748528624933, "grad_norm": 3.874253273010254, "kl": 0.286376416683197, "learning_rate": 2.1725271844162844e-06, "loss": 0.0115, "reward": 2.064218759536743, "reward_std": 0.7046195268630981, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43921875953674316, "step": 3295 }, { "completion_length": 137.03125, "epoch": 1.763509898341359, "grad_norm": 0.964499831199646, "kl": 0.17100223898887634, "learning_rate": 2.1709841821496784e-06, "loss": 0.0068, "reward": 1.6278125047683716, "reward_std": 0.4754379987716675, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4715625047683716, "step": 3296 }, { "completion_length": 119.90625, "epoch": 1.7640449438202248, "grad_norm": 0.9171754717826843, "kl": 0.20669031143188477, "learning_rate": 2.1694413074159084e-06, "loss": 0.0083, "reward": 1.3779375553131104, "reward_std": 0.45684707164764404, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4716874957084656, "step": 3297 }, { "completion_length": 146.53125, "epoch": 1.7645799892990905, "grad_norm": 5.036703586578369, "kl": 0.2632571756839752, "learning_rate": 2.1678985608130215e-06, "loss": 0.0105, "reward": 2.0266873836517334, "reward_std": 1.2432661056518555, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43293750286102295, "step": 3298 }, { "completion_length": 109.5, "epoch": 1.7651150347779563, "grad_norm": 36.41787338256836, "kl": 8.875969886779785, "learning_rate": 2.166355942939017e-06, "loss": 0.355, "reward": 2.4566874504089355, "reward_std": 0.5996795892715454, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4723125100135803, "step": 3299 }, { "completion_length": 105.125, "epoch": 1.7656500802568218, "grad_norm": 1.0073986053466797, "kl": 0.2798752188682556, "learning_rate": 2.1648134543918424e-06, "loss": 0.0112, "reward": 2.1500625610351562, "reward_std": 0.8665801286697388, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4625625014305115, "step": 3300 }, { "completion_length": 123.8125, "epoch": 1.7661851257356875, "grad_norm": 0.5992027521133423, "kl": 0.20215752720832825, "learning_rate": 2.1632710957693964e-06, "loss": 0.0081, "reward": 2.0848751068115234, "reward_std": 0.7567019462585449, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4598749876022339, "step": 3301 }, { "completion_length": 128.5625, "epoch": 1.7667201712145533, "grad_norm": 0.8541681170463562, "kl": 0.2829572558403015, "learning_rate": 2.1617288676695255e-06, "loss": 0.0113, "reward": 2.149250030517578, "reward_std": 1.1187708377838135, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4461250305175781, "step": 3302 }, { "completion_length": 122.625, "epoch": 1.7672552166934188, "grad_norm": 2.7121152877807617, "kl": 0.28326019644737244, "learning_rate": 2.160186770690027e-06, "loss": 0.0113, "reward": 2.2933125495910645, "reward_std": 0.705820620059967, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4808124899864197, "step": 3303 }, { "completion_length": 120.6875, "epoch": 1.7677902621722845, "grad_norm": 0.7072043418884277, "kl": 0.1675284206867218, "learning_rate": 2.1586448054286497e-06, "loss": 0.0067, "reward": 2.5464999675750732, "reward_std": 0.6848605871200562, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49962499737739563, "step": 3304 }, { "completion_length": 137.3125, "epoch": 1.7683253076511503, "grad_norm": 2.2906301021575928, "kl": 0.24059607088565826, "learning_rate": 2.157102972483086e-06, "loss": 0.0096, "reward": 1.8062812089920044, "reward_std": 0.862617552280426, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4469062387943268, "step": 3305 }, { "completion_length": 150.34375, "epoch": 1.768860353130016, "grad_norm": 0.4920526444911957, "kl": 0.1322668045759201, "learning_rate": 2.155561272450981e-06, "loss": 0.0053, "reward": 1.980875015258789, "reward_std": 0.9384326934814453, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.41837501525878906, "step": 3306 }, { "completion_length": 137.40625, "epoch": 1.7693953986088817, "grad_norm": 1.173323631286621, "kl": 0.2055736780166626, "learning_rate": 2.1540197059299272e-06, "loss": 0.0082, "reward": 1.9819061756134033, "reward_std": 0.9059069752693176, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4662812352180481, "step": 3307 }, { "completion_length": 141.03125, "epoch": 1.7699304440877475, "grad_norm": 0.9174509644508362, "kl": 0.18473629653453827, "learning_rate": 2.152478273517465e-06, "loss": 0.0074, "reward": 1.726312518119812, "reward_std": 1.0433013439178467, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4606874883174896, "step": 3308 }, { "completion_length": 145.0, "epoch": 1.7704654895666132, "grad_norm": 0.7613897919654846, "kl": 0.14943113923072815, "learning_rate": 2.150936975811084e-06, "loss": 0.006, "reward": 2.1489999294281006, "reward_std": 1.1340936422348022, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44587498903274536, "step": 3309 }, { "completion_length": 126.28125, "epoch": 1.771000535045479, "grad_norm": 0.6665445566177368, "kl": 0.19530326128005981, "learning_rate": 2.1493958134082196e-06, "loss": 0.0078, "reward": 1.5900624990463257, "reward_std": 0.5384705662727356, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4806874990463257, "step": 3310 }, { "completion_length": 132.40625, "epoch": 1.7715355805243447, "grad_norm": 1.658119559288025, "kl": 0.21230748295783997, "learning_rate": 2.147854786906257e-06, "loss": 0.0085, "reward": 2.1035938262939453, "reward_std": 0.5059679746627808, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44734376668930054, "step": 3311 }, { "completion_length": 113.09375, "epoch": 1.7720706260032104, "grad_norm": 1.5633541345596313, "kl": 0.2106967568397522, "learning_rate": 2.1463138969025264e-06, "loss": 0.0084, "reward": 2.151562452316284, "reward_std": 0.5931735038757324, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47968751192092896, "step": 3312 }, { "completion_length": 131.40625, "epoch": 1.772605671482076, "grad_norm": 0.7705098390579224, "kl": 0.1940741240978241, "learning_rate": 2.144773143994307e-06, "loss": 0.0078, "reward": 2.265031337738037, "reward_std": 0.6957350969314575, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46815624833106995, "step": 3313 }, { "completion_length": 121.46875, "epoch": 1.7731407169609417, "grad_norm": 1.819884181022644, "kl": 0.14908091723918915, "learning_rate": 2.1432325287788243e-06, "loss": 0.006, "reward": 1.9453125, "reward_std": 0.7822936773300171, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 3314 }, { "completion_length": 134.71875, "epoch": 1.7736757624398074, "grad_norm": 2.0872886180877686, "kl": 0.1875016987323761, "learning_rate": 2.14169205185325e-06, "loss": 0.0075, "reward": 1.328125, "reward_std": 0.1804211586713791, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.5, "step": 3315 }, { "completion_length": 129.625, "epoch": 1.7742108079186731, "grad_norm": 1.4266279935836792, "kl": 0.26113200187683105, "learning_rate": 2.140151713814702e-06, "loss": 0.0104, "reward": 1.8474375009536743, "reward_std": 0.6286625862121582, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4724375009536743, "step": 3316 }, { "completion_length": 147.96875, "epoch": 1.7747458533975387, "grad_norm": 5.271953582763672, "kl": 0.24463322758674622, "learning_rate": 2.1386115152602463e-06, "loss": 0.0098, "reward": 1.2994999885559082, "reward_std": 0.5050595998764038, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4557499885559082, "step": 3317 }, { "completion_length": 130.9375, "epoch": 1.7752808988764044, "grad_norm": 0.4890052378177643, "kl": 0.1661757081747055, "learning_rate": 2.137071456786892e-06, "loss": 0.0066, "reward": 2.796875, "reward_std": 0.5745242834091187, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 3318 }, { "completion_length": 129.8125, "epoch": 1.7758159443552701, "grad_norm": 0.9298141002655029, "kl": 0.15324899554252625, "learning_rate": 2.135531538991596e-06, "loss": 0.0061, "reward": 1.819968819618225, "reward_std": 1.2093040943145752, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4137187600135803, "step": 3319 }, { "completion_length": 125.5, "epoch": 1.7763509898341359, "grad_norm": 0.705161988735199, "kl": 0.16182780265808105, "learning_rate": 2.133991762471259e-06, "loss": 0.0065, "reward": 2.398843765258789, "reward_std": 0.7192091941833496, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46134376525878906, "step": 3320 }, { "completion_length": 137.625, "epoch": 1.7768860353130016, "grad_norm": 1.3863669633865356, "kl": 0.17063657939434052, "learning_rate": 2.1324521278227286e-06, "loss": 0.0068, "reward": 2.1508750915527344, "reward_std": 0.943301260471344, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4165000021457672, "step": 3321 }, { "completion_length": 142.65625, "epoch": 1.7774210807918673, "grad_norm": 2.0394785404205322, "kl": 0.17330920696258545, "learning_rate": 2.130912635642797e-06, "loss": 0.0069, "reward": 1.454031229019165, "reward_std": 0.9625676870346069, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.39153122901916504, "step": 3322 }, { "completion_length": 128.5, "epoch": 1.777956126270733, "grad_norm": 2.1045870780944824, "kl": 0.19589880108833313, "learning_rate": 2.1293732865282006e-06, "loss": 0.0078, "reward": 2.6684062480926514, "reward_std": 0.8203758001327515, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46528124809265137, "step": 3323 }, { "completion_length": 141.375, "epoch": 1.7784911717495988, "grad_norm": 0.955849826335907, "kl": 0.15422073006629944, "learning_rate": 2.127834081075621e-06, "loss": 0.0062, "reward": 1.0589687824249268, "reward_std": 0.3593980073928833, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.402718722820282, "step": 3324 }, { "completion_length": 155.625, "epoch": 1.7790262172284645, "grad_norm": 0.720910906791687, "kl": 0.13828879594802856, "learning_rate": 2.126295019881682e-06, "loss": 0.0055, "reward": 1.9220937490463257, "reward_std": 0.9559413194656372, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4533437490463257, "step": 3325 }, { "completion_length": 149.96875, "epoch": 1.7795612627073303, "grad_norm": 0.6546357870101929, "kl": 0.12103982269763947, "learning_rate": 2.124756103542955e-06, "loss": 0.0048, "reward": 1.59765625, "reward_std": 0.7572771310806274, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47265625, "step": 3326 }, { "completion_length": 159.84375, "epoch": 1.7800963081861958, "grad_norm": 3.1623005867004395, "kl": 0.13280120491981506, "learning_rate": 2.1232173326559534e-06, "loss": 0.0053, "reward": 1.706531286239624, "reward_std": 1.2078689336776733, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.39403125643730164, "step": 3327 }, { "completion_length": 146.4375, "epoch": 1.7806313536650615, "grad_norm": 0.9004192352294922, "kl": 0.1214844360947609, "learning_rate": 2.1216787078171334e-06, "loss": 0.0049, "reward": 1.5486249923706055, "reward_std": 0.7150908708572388, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45487499237060547, "step": 3328 }, { "completion_length": 134.53125, "epoch": 1.7811663991439273, "grad_norm": 0.83284592628479, "kl": 0.1785348653793335, "learning_rate": 2.120140229622896e-06, "loss": 0.0071, "reward": 2.297281265258789, "reward_std": 0.5071858167648315, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42228126525878906, "step": 3329 }, { "completion_length": 152.125, "epoch": 1.7817014446227928, "grad_norm": 1.7858211994171143, "kl": 0.1374334990978241, "learning_rate": 2.1186018986695844e-06, "loss": 0.0055, "reward": 2.5688438415527344, "reward_std": 0.8085944652557373, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4594687521457672, "step": 3330 }, { "completion_length": 151.5625, "epoch": 1.7822364901016585, "grad_norm": 0.5412238836288452, "kl": 0.14229945838451385, "learning_rate": 2.1170637155534854e-06, "loss": 0.0057, "reward": 1.847749948501587, "reward_std": 0.5065327882766724, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4571250081062317, "step": 3331 }, { "completion_length": 138.59375, "epoch": 1.7827715355805243, "grad_norm": 3.4478631019592285, "kl": 0.29935652017593384, "learning_rate": 2.115525680870829e-06, "loss": 0.012, "reward": 2.2397186756134033, "reward_std": 0.47387808561325073, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4740937352180481, "step": 3332 }, { "completion_length": 117.625, "epoch": 1.78330658105939, "grad_norm": 1.7767179012298584, "kl": 0.2723739743232727, "learning_rate": 2.113987795217785e-06, "loss": 0.0109, "reward": 2.22265625, "reward_std": 0.8767045140266418, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48828125, "step": 3333 }, { "completion_length": 124.125, "epoch": 1.7838416265382557, "grad_norm": 1.6280877590179443, "kl": 0.2557834982872009, "learning_rate": 2.11245005919047e-06, "loss": 0.0102, "reward": 1.8839062452316284, "reward_std": 0.9599726796150208, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4776562452316284, "step": 3334 }, { "completion_length": 122.84375, "epoch": 1.7843766720171215, "grad_norm": 1.600587248802185, "kl": 0.21875208616256714, "learning_rate": 2.1109124733849377e-06, "loss": 0.0088, "reward": 2.78515625, "reward_std": 1.0544252395629883, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 3335 }, { "completion_length": 116.90625, "epoch": 1.7849117174959872, "grad_norm": 1.0880789756774902, "kl": 0.19235306978225708, "learning_rate": 2.1093750383971864e-06, "loss": 0.0077, "reward": 2.6660313606262207, "reward_std": 0.7294163107872009, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49415624141693115, "step": 3336 }, { "completion_length": 156.3125, "epoch": 1.785446762974853, "grad_norm": 1.5292731523513794, "kl": 0.20191729068756104, "learning_rate": 2.1078377548231563e-06, "loss": 0.0081, "reward": 1.4957499504089355, "reward_std": 0.6692349314689636, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4020000100135803, "step": 3337 }, { "completion_length": 113.8125, "epoch": 1.7859818084537187, "grad_norm": 0.8389192223548889, "kl": 0.17667177319526672, "learning_rate": 2.106300623258727e-06, "loss": 0.0071, "reward": 2.8671875, "reward_std": 0.9974396824836731, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4609375, "step": 3338 }, { "completion_length": 113.28125, "epoch": 1.7865168539325844, "grad_norm": 9.45529556274414, "kl": 0.9792344570159912, "learning_rate": 2.1047636442997205e-06, "loss": 0.0392, "reward": 2.511218786239624, "reward_std": 0.45797741413116455, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49559375643730164, "step": 3339 }, { "completion_length": 124.625, "epoch": 1.78705189941145, "grad_norm": 1.8719701766967773, "kl": 0.16338388621807098, "learning_rate": 2.103226818541899e-06, "loss": 0.0065, "reward": 2.09765625, "reward_std": 0.5262158513069153, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 3340 }, { "completion_length": 132.40625, "epoch": 1.7875869448903157, "grad_norm": 0.7012263536453247, "kl": 0.17072321474552155, "learning_rate": 2.1016901465809657e-06, "loss": 0.0068, "reward": 1.3536875247955322, "reward_std": 0.5788264274597168, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44743749499320984, "step": 3341 }, { "completion_length": 148.3125, "epoch": 1.7881219903691814, "grad_norm": 2.1144227981567383, "kl": 0.22809211909770966, "learning_rate": 2.100153629012565e-06, "loss": 0.0091, "reward": 2.1405937671661377, "reward_std": 0.9926818013191223, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4530937671661377, "step": 3342 }, { "completion_length": 123.28125, "epoch": 1.7886570358480471, "grad_norm": 0.717788577079773, "kl": 0.169842928647995, "learning_rate": 2.098617266432279e-06, "loss": 0.0068, "reward": 2.421875, "reward_std": 0.4027418792247772, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 3343 }, { "completion_length": 134.65625, "epoch": 1.7891920813269127, "grad_norm": 1.7345184087753296, "kl": 0.2845722734928131, "learning_rate": 2.0970810594356335e-06, "loss": 0.0114, "reward": 2.120500087738037, "reward_std": 0.6065570116043091, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44862499833106995, "step": 3344 }, { "completion_length": 134.25, "epoch": 1.7897271268057784, "grad_norm": 0.810120701789856, "kl": 0.15277864038944244, "learning_rate": 2.0955450086180883e-06, "loss": 0.0061, "reward": 2.5682811737060547, "reward_std": 0.98177170753479, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49015623331069946, "step": 3345 }, { "completion_length": 135.0, "epoch": 1.7902621722846441, "grad_norm": 1.0301947593688965, "kl": 0.1647503823041916, "learning_rate": 2.094009114575049e-06, "loss": 0.0066, "reward": 1.4862499237060547, "reward_std": 0.525466799736023, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47062498331069946, "step": 3346 }, { "completion_length": 121.5625, "epoch": 1.7907972177635099, "grad_norm": 87.08750915527344, "kl": 0.5757355690002441, "learning_rate": 2.0924733779018578e-06, "loss": 0.023, "reward": 1.9976249933242798, "reward_std": 0.7711588144302368, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4663749933242798, "step": 3347 }, { "completion_length": 138.9375, "epoch": 1.7913322632423756, "grad_norm": 1.0537813901901245, "kl": 0.2236432433128357, "learning_rate": 2.090937799193793e-06, "loss": 0.0089, "reward": 1.9361250400543213, "reward_std": 0.8289132118225098, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4829999804496765, "step": 3348 }, { "completion_length": 136.3125, "epoch": 1.7918673087212413, "grad_norm": 5.336506366729736, "kl": 0.5247191786766052, "learning_rate": 2.089402379046077e-06, "loss": 0.021, "reward": 2.255906343460083, "reward_std": 0.8188762664794922, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.42778125405311584, "step": 3349 }, { "completion_length": 128.15625, "epoch": 1.792402354200107, "grad_norm": 1.1719882488250732, "kl": 0.18368220329284668, "learning_rate": 2.087867118053867e-06, "loss": 0.0073, "reward": 2.123718738555908, "reward_std": 1.0008296966552734, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4830937385559082, "step": 3350 }, { "completion_length": 134.03125, "epoch": 1.7929373996789728, "grad_norm": 0.815004825592041, "kl": 0.1691398024559021, "learning_rate": 2.086332016812258e-06, "loss": 0.0068, "reward": 2.11328125, "reward_std": 0.7737371325492859, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48828125, "step": 3351 }, { "completion_length": 138.625, "epoch": 1.7934724451578385, "grad_norm": 1.495360255241394, "kl": 0.2328631430864334, "learning_rate": 2.0847970759162885e-06, "loss": 0.0093, "reward": 1.6616562604904175, "reward_std": 0.6827013492584229, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4116562604904175, "step": 3352 }, { "completion_length": 125.625, "epoch": 1.7940074906367043, "grad_norm": 0.8766142129898071, "kl": 0.19095298647880554, "learning_rate": 2.0832622959609273e-06, "loss": 0.0076, "reward": 1.921875, "reward_std": 0.6889641284942627, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 3353 }, { "completion_length": 155.0625, "epoch": 1.7945425361155698, "grad_norm": 1.1338647603988647, "kl": 0.14560987055301666, "learning_rate": 2.0817276775410853e-06, "loss": 0.0058, "reward": 1.4001874923706055, "reward_std": 0.6147871017456055, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43143749237060547, "step": 3354 }, { "completion_length": 120.28125, "epoch": 1.7950775815944355, "grad_norm": 1.3764506578445435, "kl": 0.24709878861904144, "learning_rate": 2.080193221251613e-06, "loss": 0.0099, "reward": 2.609375, "reward_std": 0.6970610022544861, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 3355 }, { "completion_length": 133.0, "epoch": 1.7956126270733013, "grad_norm": 1.3264552354812622, "kl": 0.25510403513908386, "learning_rate": 2.0786589276872913e-06, "loss": 0.0102, "reward": 2.453125, "reward_std": 0.7356289625167847, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.484375, "step": 3356 }, { "completion_length": 131.09375, "epoch": 1.7961476725521668, "grad_norm": 1.8634154796600342, "kl": 0.16487029194831848, "learning_rate": 2.0771247974428437e-06, "loss": 0.0066, "reward": 1.5432499647140503, "reward_std": 0.5453180074691772, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4651249945163727, "step": 3357 }, { "completion_length": 133.84375, "epoch": 1.7966827180310325, "grad_norm": 1.113827109336853, "kl": 0.203791081905365, "learning_rate": 2.0755908311129277e-06, "loss": 0.0082, "reward": 2.515625, "reward_std": 0.9251773357391357, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 3358 }, { "completion_length": 130.375, "epoch": 1.7972177635098983, "grad_norm": 105.99542236328125, "kl": 0.7965074181556702, "learning_rate": 2.074057029292138e-06, "loss": 0.0319, "reward": 2.081718683242798, "reward_std": 0.6496001482009888, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4723437428474426, "step": 3359 }, { "completion_length": 130.0625, "epoch": 1.797752808988764, "grad_norm": 0.880437970161438, "kl": 0.25201642513275146, "learning_rate": 2.0725233925750064e-06, "loss": 0.0101, "reward": 1.8006563186645508, "reward_std": 0.7911609411239624, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 3360 }, { "completion_length": 116.125, "epoch": 1.7982878544676297, "grad_norm": 1.2760027647018433, "kl": 0.24139904975891113, "learning_rate": 2.070989921555999e-06, "loss": 0.0097, "reward": 1.901937484741211, "reward_std": 0.9416423439979553, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4175625145435333, "step": 3361 }, { "completion_length": 123.875, "epoch": 1.7988228999464955, "grad_norm": 0.9694662690162659, "kl": 0.21342967450618744, "learning_rate": 2.0694566168295197e-06, "loss": 0.0085, "reward": 1.9226562976837158, "reward_std": 0.4873490035533905, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45390623807907104, "step": 3362 }, { "completion_length": 132.5625, "epoch": 1.7993579454253612, "grad_norm": 0.6373264789581299, "kl": 0.16791176795959473, "learning_rate": 2.0679234789899054e-06, "loss": 0.0067, "reward": 1.686500072479248, "reward_std": 0.8653311133384705, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48337501287460327, "step": 3363 }, { "completion_length": 115.96875, "epoch": 1.799892990904227, "grad_norm": 1.2709633111953735, "kl": 0.16344605386257172, "learning_rate": 2.06639050863143e-06, "loss": 0.0065, "reward": 1.7342500686645508, "reward_std": 0.4519107937812805, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 3364 }, { "completion_length": 147.46875, "epoch": 1.8004280363830927, "grad_norm": 1.3556904792785645, "kl": 0.22066862881183624, "learning_rate": 2.0648577063483038e-06, "loss": 0.0088, "reward": 2.7055625915527344, "reward_std": 1.041099190711975, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4711875021457672, "step": 3365 }, { "completion_length": 130.90625, "epoch": 1.8009630818619584, "grad_norm": 0.6156366467475891, "kl": 0.1609765738248825, "learning_rate": 2.0633250727346678e-06, "loss": 0.0064, "reward": 1.9237186908721924, "reward_std": 0.7062255144119263, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45496875047683716, "step": 3366 }, { "completion_length": 131.96875, "epoch": 1.801498127340824, "grad_norm": 1.8346097469329834, "kl": 0.1638614535331726, "learning_rate": 2.061792608384603e-06, "loss": 0.0066, "reward": 1.9347813129425049, "reward_std": 0.816441535949707, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4816562533378601, "step": 3367 }, { "completion_length": 139.71875, "epoch": 1.8020331728196897, "grad_norm": 5.5820536613464355, "kl": 0.20300298929214478, "learning_rate": 2.0602603138921187e-06, "loss": 0.0081, "reward": 2.3899688720703125, "reward_std": 0.6263187527656555, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46809375286102295, "step": 3368 }, { "completion_length": 132.5625, "epoch": 1.8025682182985554, "grad_norm": 1.3773713111877441, "kl": 0.25588712096214294, "learning_rate": 2.058728189851164e-06, "loss": 0.0102, "reward": 2.008531093597412, "reward_std": 0.6026830673217773, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49290624260902405, "step": 3369 }, { "completion_length": 123.40625, "epoch": 1.803103263777421, "grad_norm": 0.8287779092788696, "kl": 0.18223866820335388, "learning_rate": 2.0571962368556196e-06, "loss": 0.0073, "reward": 2.291749954223633, "reward_std": 0.8093276619911194, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4948750138282776, "step": 3370 }, { "completion_length": 126.5, "epoch": 1.8036383092562867, "grad_norm": 0.5143802762031555, "kl": 0.1646592915058136, "learning_rate": 2.0556644554992975e-06, "loss": 0.0066, "reward": 2.132218837738037, "reward_std": 0.5497698783874512, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46034374833106995, "step": 3371 }, { "completion_length": 120.03125, "epoch": 1.8041733547351524, "grad_norm": 1.6907033920288086, "kl": 0.3099316954612732, "learning_rate": 2.0541328463759487e-06, "loss": 0.0124, "reward": 1.9701249599456787, "reward_std": 0.37016376852989197, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4857499897480011, "step": 3372 }, { "completion_length": 116.09375, "epoch": 1.8047084002140181, "grad_norm": 1.6867375373840332, "kl": 0.20996622741222382, "learning_rate": 2.0526014100792517e-06, "loss": 0.0084, "reward": 2.5485000610351562, "reward_std": 0.7482073307037354, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4703750014305115, "step": 3373 }, { "completion_length": 151.1875, "epoch": 1.8052434456928839, "grad_norm": 1.4401739835739136, "kl": 0.16065920889377594, "learning_rate": 2.0510701472028207e-06, "loss": 0.0064, "reward": 1.72084379196167, "reward_std": 0.9600437879562378, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40834373235702515, "step": 3374 }, { "completion_length": 140.53125, "epoch": 1.8057784911717496, "grad_norm": 1.2299262285232544, "kl": 0.17673993110656738, "learning_rate": 2.049539058340205e-06, "loss": 0.0071, "reward": 2.0691561698913574, "reward_std": 0.8368942737579346, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4129062592983246, "step": 3375 }, { "completion_length": 138.46875, "epoch": 1.8063135366506153, "grad_norm": 5379.66357421875, "kl": 291.7844543457031, "learning_rate": 2.0480081440848816e-06, "loss": 11.6714, "reward": 1.908593773841858, "reward_std": 0.547921895980835, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4554687440395355, "step": 3376 }, { "completion_length": 120.84375, "epoch": 1.806848582129481, "grad_norm": 1.2931511402130127, "kl": 0.1783081293106079, "learning_rate": 2.0464774050302627e-06, "loss": 0.0071, "reward": 2.0215001106262207, "reward_std": 0.7638375759124756, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47462499141693115, "step": 3377 }, { "completion_length": 122.03125, "epoch": 1.8073836276083468, "grad_norm": 0.6292961835861206, "kl": 0.16176800429821014, "learning_rate": 2.0449468417696926e-06, "loss": 0.0065, "reward": 2.364375114440918, "reward_std": 0.2595663368701935, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4893749952316284, "step": 3378 }, { "completion_length": 141.5, "epoch": 1.8079186730872125, "grad_norm": 0.9154757857322693, "kl": 0.21275870501995087, "learning_rate": 2.043416454896446e-06, "loss": 0.0085, "reward": 1.342437505722046, "reward_std": 0.5400251150131226, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4518125057220459, "step": 3379 }, { "completion_length": 142.71875, "epoch": 1.808453718566078, "grad_norm": 927.6472778320312, "kl": 2.2189462184906006, "learning_rate": 2.0418862450037315e-06, "loss": 0.0888, "reward": 1.7745000123977661, "reward_std": 0.554560661315918, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3994999825954437, "step": 3380 }, { "completion_length": 121.5, "epoch": 1.8089887640449438, "grad_norm": 0.8396779894828796, "kl": 0.1978977620601654, "learning_rate": 2.0403562126846863e-06, "loss": 0.0079, "reward": 2.51953125, "reward_std": 0.7380009889602661, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 3381 }, { "completion_length": 135.09375, "epoch": 1.8095238095238095, "grad_norm": 7.103418350219727, "kl": 0.20117498934268951, "learning_rate": 2.038826358532382e-06, "loss": 0.008, "reward": 2.644124984741211, "reward_std": 0.5882741212844849, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47224998474121094, "step": 3382 }, { "completion_length": 143.8125, "epoch": 1.8100588550026753, "grad_norm": 1.7937222719192505, "kl": 0.5083631277084351, "learning_rate": 2.037296683139818e-06, "loss": 0.0203, "reward": 2.3415937423706055, "reward_std": 0.6539431810379028, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46659374237060547, "step": 3383 }, { "completion_length": 157.8125, "epoch": 1.8105939004815408, "grad_norm": 6.645075798034668, "kl": 0.6946445107460022, "learning_rate": 2.035767187099926e-06, "loss": 0.0278, "reward": 2.0395936965942383, "reward_std": 1.0102009773254395, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43021875619888306, "step": 3384 }, { "completion_length": 124.84375, "epoch": 1.8111289459604065, "grad_norm": 4.7270097732543945, "kl": 0.5432623624801636, "learning_rate": 2.0342378710055686e-06, "loss": 0.0217, "reward": 1.6328125, "reward_std": 0.7188748121261597, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 3385 }, { "completion_length": 126.0, "epoch": 1.8116639914392723, "grad_norm": 1.7183809280395508, "kl": 0.20619532465934753, "learning_rate": 2.0327087354495374e-06, "loss": 0.0082, "reward": 1.6567187309265137, "reward_std": 0.599938154220581, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45359376072883606, "step": 3386 }, { "completion_length": 151.125, "epoch": 1.812199036918138, "grad_norm": 2.224799156188965, "kl": 0.16752387583255768, "learning_rate": 2.0311797810245546e-06, "loss": 0.0067, "reward": 2.067812442779541, "reward_std": 0.788886308670044, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4428125023841858, "step": 3387 }, { "completion_length": 114.5, "epoch": 1.8127340823970037, "grad_norm": 1.0375330448150635, "kl": 0.21328812837600708, "learning_rate": 2.029651008323273e-06, "loss": 0.0085, "reward": 1.991781234741211, "reward_std": 0.872887134552002, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47615623474121094, "step": 3388 }, { "completion_length": 142.25, "epoch": 1.8132691278758695, "grad_norm": 1.4109317064285278, "kl": 0.2149961292743683, "learning_rate": 2.0281224179382737e-06, "loss": 0.0086, "reward": 2.2466249465942383, "reward_std": 0.8521532416343689, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46537500619888306, "step": 3389 }, { "completion_length": 125.375, "epoch": 1.8138041733547352, "grad_norm": 14.021418571472168, "kl": 1.0223524570465088, "learning_rate": 2.0265940104620678e-06, "loss": 0.0409, "reward": 2.296875, "reward_std": 0.3516511619091034, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 3390 }, { "completion_length": 99.375, "epoch": 1.814339218833601, "grad_norm": 1.0264246463775635, "kl": 0.18960057199001312, "learning_rate": 2.0250657864870944e-06, "loss": 0.0076, "reward": 2.683687448501587, "reward_std": 0.5245988368988037, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4961875081062317, "step": 3391 }, { "completion_length": 129.875, "epoch": 1.8148742643124667, "grad_norm": 1.6079397201538086, "kl": 0.2077820897102356, "learning_rate": 2.023537746605723e-06, "loss": 0.0083, "reward": 2.042562484741211, "reward_std": 0.9761420488357544, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44881248474121094, "step": 3392 }, { "completion_length": 134.5, "epoch": 1.8154093097913324, "grad_norm": 2.7720067501068115, "kl": 0.39187997579574585, "learning_rate": 2.0220098914102526e-06, "loss": 0.0157, "reward": 2.138625144958496, "reward_std": 0.9645323753356934, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.43549999594688416, "step": 3393 }, { "completion_length": 147.25, "epoch": 1.815944355270198, "grad_norm": 0.7474210262298584, "kl": 0.24123631417751312, "learning_rate": 2.0204822214929064e-06, "loss": 0.0096, "reward": 1.194718837738037, "reward_std": 0.7788389921188354, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.39784377813339233, "step": 3394 }, { "completion_length": 145.6875, "epoch": 1.8164794007490637, "grad_norm": 1.1177396774291992, "kl": 0.1883622407913208, "learning_rate": 2.0189547374458414e-06, "loss": 0.0075, "reward": 1.9501875638961792, "reward_std": 0.9008578658103943, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4345625042915344, "step": 3395 }, { "completion_length": 105.65625, "epoch": 1.8170144462279294, "grad_norm": 1.581809639930725, "kl": 0.26933300495147705, "learning_rate": 2.017427439861137e-06, "loss": 0.0108, "reward": 3.0141875743865967, "reward_std": 0.7274838089942932, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4985625147819519, "step": 3396 }, { "completion_length": 153.8125, "epoch": 1.817549491706795, "grad_norm": 1.1539467573165894, "kl": 0.14350095391273499, "learning_rate": 2.015900329330804e-06, "loss": 0.0057, "reward": 2.1070938110351562, "reward_std": 0.4684379994869232, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4352187514305115, "step": 3397 }, { "completion_length": 139.59375, "epoch": 1.8180845371856607, "grad_norm": 0.8450430631637573, "kl": 0.1491217017173767, "learning_rate": 2.014373406446781e-06, "loss": 0.006, "reward": 2.0136561393737793, "reward_std": 0.7622371912002563, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43553125858306885, "step": 3398 }, { "completion_length": 127.53125, "epoch": 1.8186195826645264, "grad_norm": 1.4901738166809082, "kl": 0.4695310592651367, "learning_rate": 2.012846671800931e-06, "loss": 0.0188, "reward": 1.4584062099456787, "reward_std": 0.7141962051391602, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4427812397480011, "step": 3399 }, { "completion_length": 150.78125, "epoch": 1.8191546281433921, "grad_norm": 0.9157888889312744, "kl": 0.17447341978549957, "learning_rate": 2.011320125985047e-06, "loss": 0.007, "reward": 1.1887500286102295, "reward_std": 0.5361392498016357, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4231249988079071, "step": 3400 }, { "completion_length": 129.3125, "epoch": 1.8196896736222579, "grad_norm": 0.651142418384552, "kl": 0.1755058467388153, "learning_rate": 2.0097937695908457e-06, "loss": 0.007, "reward": 2.312375068664551, "reward_std": 0.961275577545166, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 3401 }, { "completion_length": 138.0, "epoch": 1.8202247191011236, "grad_norm": 1.5669537782669067, "kl": 0.20503365993499756, "learning_rate": 2.0082676032099725e-06, "loss": 0.0082, "reward": 1.671875, "reward_std": 0.38553935289382935, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46875, "step": 3402 }, { "completion_length": 152.0625, "epoch": 1.8207597645799893, "grad_norm": 0.9247594475746155, "kl": 0.21740615367889404, "learning_rate": 2.0067416274340003e-06, "loss": 0.0087, "reward": 1.8430625200271606, "reward_std": 0.5328303575515747, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45243752002716064, "step": 3403 }, { "completion_length": 141.96875, "epoch": 1.821294810058855, "grad_norm": 1.4967339038848877, "kl": 0.1826796978712082, "learning_rate": 2.0052158428544245e-06, "loss": 0.0073, "reward": 1.7102500200271606, "reward_std": 0.7899705171585083, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44462499022483826, "step": 3404 }, { "completion_length": 128.375, "epoch": 1.8218298555377208, "grad_norm": 1.7166093587875366, "kl": 0.2429780811071396, "learning_rate": 2.0036902500626702e-06, "loss": 0.0097, "reward": 1.3999687433242798, "reward_std": 0.6937925815582275, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4155937433242798, "step": 3405 }, { "completion_length": 140.0, "epoch": 1.8223649010165865, "grad_norm": 1.447416067123413, "kl": 0.184524804353714, "learning_rate": 2.002164849650085e-06, "loss": 0.0074, "reward": 1.7463749647140503, "reward_std": 0.87271648645401, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4651249945163727, "step": 3406 }, { "completion_length": 146.375, "epoch": 1.822899946495452, "grad_norm": 1.0681174993515015, "kl": 0.18492163717746735, "learning_rate": 2.000639642207944e-06, "loss": 0.0074, "reward": 1.7410625219345093, "reward_std": 0.8321977853775024, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4285624921321869, "step": 3407 }, { "completion_length": 141.46875, "epoch": 1.8234349919743178, "grad_norm": 1.5261532068252563, "kl": 0.19102835655212402, "learning_rate": 1.9991146283274466e-06, "loss": 0.0076, "reward": 1.4391562938690186, "reward_std": 0.5228379368782043, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4235312342643738, "step": 3408 }, { "completion_length": 129.625, "epoch": 1.8239700374531835, "grad_norm": 1.1331303119659424, "kl": 0.17691877484321594, "learning_rate": 1.997589808599718e-06, "loss": 0.0071, "reward": 2.705437660217285, "reward_std": 0.8353490829467773, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4710625112056732, "step": 3409 }, { "completion_length": 145.28125, "epoch": 1.8245050829320493, "grad_norm": 2.5864098072052, "kl": 0.4779360592365265, "learning_rate": 1.9960651836158068e-06, "loss": 0.0191, "reward": 2.01953125, "reward_std": 0.9381868839263916, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 3410 }, { "completion_length": 138.9375, "epoch": 1.8250401284109148, "grad_norm": 0.814712643623352, "kl": 0.1850488781929016, "learning_rate": 1.994540753966687e-06, "loss": 0.0074, "reward": 1.8922499418258667, "reward_std": 0.6998693943023682, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4703750014305115, "step": 3411 }, { "completion_length": 124.9375, "epoch": 1.8255751738897805, "grad_norm": 1.7351183891296387, "kl": 0.2507445216178894, "learning_rate": 1.9930165202432564e-06, "loss": 0.01, "reward": 2.0755937099456787, "reward_std": 0.8983553051948547, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4818437397480011, "step": 3412 }, { "completion_length": 127.4375, "epoch": 1.8261102193686463, "grad_norm": 1.5428133010864258, "kl": 0.23985223472118378, "learning_rate": 1.991492483036339e-06, "loss": 0.0096, "reward": 2.1746249198913574, "reward_std": 0.7688544988632202, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4871250092983246, "step": 3413 }, { "completion_length": 137.625, "epoch": 1.826645264847512, "grad_norm": 28.982837677001953, "kl": 1.4285426139831543, "learning_rate": 1.9899686429366782e-06, "loss": 0.0571, "reward": 2.52734375, "reward_std": 0.8417401313781738, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46484375, "step": 3414 }, { "completion_length": 136.40625, "epoch": 1.8271803103263777, "grad_norm": 1.5627285242080688, "kl": 0.20945781469345093, "learning_rate": 1.9884450005349467e-06, "loss": 0.0084, "reward": 1.9669687747955322, "reward_std": 1.0766077041625977, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45134374499320984, "step": 3415 }, { "completion_length": 117.46875, "epoch": 1.8277153558052435, "grad_norm": 0.6524099707603455, "kl": 0.19140684604644775, "learning_rate": 1.986921556421734e-06, "loss": 0.0077, "reward": 2.1520938873291016, "reward_std": 0.3745886981487274, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4802187383174896, "step": 3416 }, { "completion_length": 131.3125, "epoch": 1.8282504012841092, "grad_norm": 1.829362154006958, "kl": 0.15843211114406586, "learning_rate": 1.985398311187558e-06, "loss": 0.0063, "reward": 1.5078125, "reward_std": 0.666978120803833, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 3417 }, { "completion_length": 143.96875, "epoch": 1.828785446762975, "grad_norm": 1.0169612169265747, "kl": 0.15381431579589844, "learning_rate": 1.983875265422859e-06, "loss": 0.0062, "reward": 1.2781875133514404, "reward_std": 0.7400723695755005, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43443751335144043, "step": 3418 }, { "completion_length": 131.90625, "epoch": 1.8293204922418407, "grad_norm": 0.7239803075790405, "kl": 0.16768184304237366, "learning_rate": 1.9823524197179965e-06, "loss": 0.0067, "reward": 1.722406268119812, "reward_std": 0.6553534269332886, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.472406268119812, "step": 3419 }, { "completion_length": 134.84375, "epoch": 1.8298555377207064, "grad_norm": 0.9058874249458313, "kl": 0.15453992784023285, "learning_rate": 1.980829774663256e-06, "loss": 0.0062, "reward": 2.5707812309265137, "reward_std": 1.1761672496795654, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44578126072883606, "step": 3420 }, { "completion_length": 125.1875, "epoch": 1.830390583199572, "grad_norm": 0.6487913727760315, "kl": 0.18412956595420837, "learning_rate": 1.9793073308488426e-06, "loss": 0.0074, "reward": 1.7147188186645508, "reward_std": 0.4919261932373047, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.480343759059906, "step": 3421 }, { "completion_length": 151.09375, "epoch": 1.8309256286784377, "grad_norm": 1.9014763832092285, "kl": 0.1627037078142166, "learning_rate": 1.9777850888648863e-06, "loss": 0.0065, "reward": 1.1710937023162842, "reward_std": 0.5116366147994995, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40546876192092896, "step": 3422 }, { "completion_length": 121.6875, "epoch": 1.8314606741573034, "grad_norm": 0.7378205060958862, "kl": 0.18815848231315613, "learning_rate": 1.9762630493014367e-06, "loss": 0.0075, "reward": 2.2964999675750732, "reward_std": 0.8473688960075378, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49962499737739563, "step": 3423 }, { "completion_length": 130.21875, "epoch": 1.831995719636169, "grad_norm": 1.881427526473999, "kl": 0.2199452519416809, "learning_rate": 1.974741212748465e-06, "loss": 0.0088, "reward": 2.4667811393737793, "reward_std": 0.8302074074745178, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46678125858306885, "step": 3424 }, { "completion_length": 148.125, "epoch": 1.8325307651150347, "grad_norm": 0.5458316206932068, "kl": 0.1524156630039215, "learning_rate": 1.973219579795864e-06, "loss": 0.0061, "reward": 1.984375, "reward_std": 0.6731314659118652, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46875, "step": 3425 }, { "completion_length": 123.25, "epoch": 1.8330658105939004, "grad_norm": 9.35759449005127, "kl": 0.24421584606170654, "learning_rate": 1.97169815103345e-06, "loss": 0.0098, "reward": 2.691281318664551, "reward_std": 0.6016330122947693, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 3426 }, { "completion_length": 134.875, "epoch": 1.8336008560727661, "grad_norm": 0.8821380138397217, "kl": 0.19007623195648193, "learning_rate": 1.970176927050955e-06, "loss": 0.0076, "reward": 2.05078125, "reward_std": 0.8190844655036926, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 3427 }, { "completion_length": 142.15625, "epoch": 1.8341359015516319, "grad_norm": 5.345384120941162, "kl": 0.21504876017570496, "learning_rate": 1.9686559084380363e-06, "loss": 0.0086, "reward": 1.565250039100647, "reward_std": 0.6652154922485352, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4402500092983246, "step": 3428 }, { "completion_length": 127.71875, "epoch": 1.8346709470304976, "grad_norm": 0.5995127558708191, "kl": 0.17122751474380493, "learning_rate": 1.967135095784269e-06, "loss": 0.0068, "reward": 1.6997811794281006, "reward_std": 0.6420401334762573, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46540623903274536, "step": 3429 }, { "completion_length": 128.65625, "epoch": 1.8352059925093633, "grad_norm": 0.7568969130516052, "kl": 0.17857997119426727, "learning_rate": 1.96561448967915e-06, "loss": 0.0071, "reward": 2.1711249351501465, "reward_std": 1.0684224367141724, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45237502455711365, "step": 3430 }, { "completion_length": 119.625, "epoch": 1.835741037988229, "grad_norm": 1.2719467878341675, "kl": 0.21071118116378784, "learning_rate": 1.9640940907120954e-06, "loss": 0.0084, "reward": 1.9364062547683716, "reward_std": 0.8401800394058228, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4832812547683716, "step": 3431 }, { "completion_length": 139.03125, "epoch": 1.8362760834670948, "grad_norm": 0.6772851943969727, "kl": 0.13414223492145538, "learning_rate": 1.96257389947244e-06, "loss": 0.0054, "reward": 1.6915311813354492, "reward_std": 0.23801672458648682, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.425906240940094, "step": 3432 }, { "completion_length": 131.78125, "epoch": 1.8368111289459605, "grad_norm": 1.262151837348938, "kl": 0.20077237486839294, "learning_rate": 1.96105391654944e-06, "loss": 0.008, "reward": 2.3786873817443848, "reward_std": 0.9192217588424683, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4568125009536743, "step": 3433 }, { "completion_length": 119.875, "epoch": 1.837346174424826, "grad_norm": 1.1577694416046143, "kl": 0.19858106970787048, "learning_rate": 1.9595341425322696e-06, "loss": 0.0079, "reward": 1.9058125019073486, "reward_std": 0.759158730506897, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.49956250190734863, "step": 3434 }, { "completion_length": 144.21875, "epoch": 1.8378812199036918, "grad_norm": 21.385602951049805, "kl": 1.1045384407043457, "learning_rate": 1.958014578010022e-06, "loss": 0.0442, "reward": 1.756906270980835, "reward_std": 0.7463876008987427, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4444062411785126, "step": 3435 }, { "completion_length": 140.46875, "epoch": 1.8384162653825575, "grad_norm": 2.5607807636260986, "kl": 0.2525857985019684, "learning_rate": 1.95649522357171e-06, "loss": 0.0101, "reward": 1.9266562461853027, "reward_std": 0.93751060962677, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47353124618530273, "step": 3436 }, { "completion_length": 125.375, "epoch": 1.8389513108614233, "grad_norm": 1.1480873823165894, "kl": 0.19146311283111572, "learning_rate": 1.954976079806264e-06, "loss": 0.0077, "reward": 1.8718750476837158, "reward_std": 0.31720542907714844, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49687498807907104, "step": 3437 }, { "completion_length": 130.59375, "epoch": 1.8394863563402888, "grad_norm": 2.194333076477051, "kl": 0.25055617094039917, "learning_rate": 1.9534571473025344e-06, "loss": 0.01, "reward": 2.528656244277954, "reward_std": 1.1455366611480713, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4661562442779541, "step": 3438 }, { "completion_length": 120.8125, "epoch": 1.8400214018191545, "grad_norm": 1.9595997333526611, "kl": 0.17288079857826233, "learning_rate": 1.9519384266492865e-06, "loss": 0.0069, "reward": 2.7401561737060547, "reward_std": 0.4657626748085022, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45890623331069946, "step": 3439 }, { "completion_length": 127.03125, "epoch": 1.8405564472980203, "grad_norm": 1.9573240280151367, "kl": 0.36404693126678467, "learning_rate": 1.9504199184352073e-06, "loss": 0.0146, "reward": 2.613062620162964, "reward_std": 0.594491720199585, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4724375009536743, "step": 3440 }, { "completion_length": 150.625, "epoch": 1.841091492776886, "grad_norm": 1.0238255262374878, "kl": 0.1441822350025177, "learning_rate": 1.9489016232489e-06, "loss": 0.0058, "reward": 1.5881562232971191, "reward_std": 0.8365652561187744, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44753125309944153, "step": 3441 }, { "completion_length": 134.625, "epoch": 1.8416265382557517, "grad_norm": 0.8310756087303162, "kl": 0.20543429255485535, "learning_rate": 1.9473835416788826e-06, "loss": 0.0082, "reward": 1.946906328201294, "reward_std": 0.746194064617157, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46253126859664917, "step": 3442 }, { "completion_length": 141.59375, "epoch": 1.8421615837346175, "grad_norm": 2.4350054264068604, "kl": 0.15996450185775757, "learning_rate": 1.945865674313595e-06, "loss": 0.0064, "reward": 2.275343894958496, "reward_std": 0.7187216281890869, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47846874594688416, "step": 3443 }, { "completion_length": 120.34375, "epoch": 1.8426966292134832, "grad_norm": 1.5283197164535522, "kl": 0.1995246410369873, "learning_rate": 1.9443480217413903e-06, "loss": 0.008, "reward": 2.484375, "reward_std": 0.9111011028289795, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.484375, "step": 3444 }, { "completion_length": 128.65625, "epoch": 1.843231674692349, "grad_norm": 0.5275474190711975, "kl": 0.13038136065006256, "learning_rate": 1.94283058455054e-06, "loss": 0.0052, "reward": 2.621875047683716, "reward_std": 0.5158600211143494, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48124998807907104, "step": 3445 }, { "completion_length": 108.4375, "epoch": 1.8437667201712147, "grad_norm": 0.9707061052322388, "kl": 0.20328941941261292, "learning_rate": 1.941313363329232e-06, "loss": 0.0081, "reward": 2.233062505722046, "reward_std": 0.7807630300521851, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4830625057220459, "step": 3446 }, { "completion_length": 120.0, "epoch": 1.8443017656500804, "grad_norm": 0.9752534031867981, "kl": 0.18753866851329803, "learning_rate": 1.9397963586655703e-06, "loss": 0.0075, "reward": 2.4313437938690186, "reward_std": 0.4796733856201172, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49384376406669617, "step": 3447 }, { "completion_length": 131.90625, "epoch": 1.844836811128946, "grad_norm": 0.7690185308456421, "kl": 0.157904714345932, "learning_rate": 1.938279571147575e-06, "loss": 0.0063, "reward": 1.6395937204360962, "reward_std": 0.6581833362579346, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4520937502384186, "step": 3448 }, { "completion_length": 116.25, "epoch": 1.8453718566078117, "grad_norm": 2.786987543106079, "kl": 0.43894222378730774, "learning_rate": 1.936763001363182e-06, "loss": 0.0176, "reward": 1.9410624504089355, "reward_std": 0.6771551370620728, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4410625100135803, "step": 3449 }, { "completion_length": 136.875, "epoch": 1.8459069020866774, "grad_norm": 0.9508014917373657, "kl": 0.1728314310312271, "learning_rate": 1.935246649900242e-06, "loss": 0.0069, "reward": 2.03125, "reward_std": 0.7403918504714966, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.453125, "step": 3450 }, { "completion_length": 134.78125, "epoch": 1.846441947565543, "grad_norm": 191.33975219726562, "kl": 7.182339191436768, "learning_rate": 1.933730517346524e-06, "loss": 0.2873, "reward": 1.9330313205718994, "reward_std": 0.9676427245140076, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46428126096725464, "step": 3451 }, { "completion_length": 140.59375, "epoch": 1.8469769930444087, "grad_norm": 2.366748332977295, "kl": 0.27089807391166687, "learning_rate": 1.9322146042897076e-06, "loss": 0.0108, "reward": 1.928531289100647, "reward_std": 0.6857880353927612, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4597812592983246, "step": 3452 }, { "completion_length": 134.46875, "epoch": 1.8475120385232744, "grad_norm": 1.325835943222046, "kl": 0.21340411901474, "learning_rate": 1.9306989113173917e-06, "loss": 0.0085, "reward": 1.6379063129425049, "reward_std": 0.7127118706703186, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4347812533378601, "step": 3453 }, { "completion_length": 128.5625, "epoch": 1.8480470840021401, "grad_norm": 0.8383083343505859, "kl": 0.16859525442123413, "learning_rate": 1.9291834390170866e-06, "loss": 0.0067, "reward": 2.120968818664551, "reward_std": 1.0872247219085693, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.417843759059906, "step": 3454 }, { "completion_length": 132.71875, "epoch": 1.8485821294810059, "grad_norm": 0.920531153678894, "kl": 0.1974782645702362, "learning_rate": 1.9276681879762187e-06, "loss": 0.0079, "reward": 1.904687523841858, "reward_std": 0.7397534847259521, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4515624940395355, "step": 3455 }, { "completion_length": 119.71875, "epoch": 1.8491171749598716, "grad_norm": 0.7592769265174866, "kl": 0.20633047819137573, "learning_rate": 1.926153158782129e-06, "loss": 0.0083, "reward": 2.309187412261963, "reward_std": 1.1415424346923828, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48106250166893005, "step": 3456 }, { "completion_length": 120.1875, "epoch": 1.8496522204387373, "grad_norm": 0.8863000869750977, "kl": 0.21122823655605316, "learning_rate": 1.9246383520220707e-06, "loss": 0.0084, "reward": 1.299187421798706, "reward_std": 0.5766465067863464, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4554375112056732, "step": 3457 }, { "completion_length": 145.84375, "epoch": 1.850187265917603, "grad_norm": 1.0984091758728027, "kl": 0.17373114824295044, "learning_rate": 1.9231237682832128e-06, "loss": 0.0069, "reward": 1.6610312461853027, "reward_std": 0.5280697345733643, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45790624618530273, "step": 3458 }, { "completion_length": 139.5, "epoch": 1.8507223113964688, "grad_norm": 2.9792659282684326, "kl": 0.19328436255455017, "learning_rate": 1.9216094081526372e-06, "loss": 0.0077, "reward": 1.4477499723434448, "reward_std": 0.87042635679245, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4008750021457672, "step": 3459 }, { "completion_length": 116.4375, "epoch": 1.8512573568753345, "grad_norm": 0.825620174407959, "kl": 0.1977163851261139, "learning_rate": 1.920095272217338e-06, "loss": 0.0079, "reward": 2.438187599182129, "reward_std": 0.6499149799346924, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48506250977516174, "step": 3460 }, { "completion_length": 137.21875, "epoch": 1.8517924023542, "grad_norm": 0.6105626821517944, "kl": 0.16318754851818085, "learning_rate": 1.9185813610642248e-06, "loss": 0.0065, "reward": 1.8125, "reward_std": 0.16230806708335876, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46875, "step": 3461 }, { "completion_length": 105.875, "epoch": 1.8523274478330658, "grad_norm": 2.300088405609131, "kl": 0.35013359785079956, "learning_rate": 1.917067675280116e-06, "loss": 0.014, "reward": 2.1459686756134033, "reward_std": 0.8444949984550476, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4897187352180481, "step": 3462 }, { "completion_length": 101.96875, "epoch": 1.8528624933119315, "grad_norm": 0.5883427858352661, "kl": 0.2523266673088074, "learning_rate": 1.915554215451747e-06, "loss": 0.0101, "reward": 2.036749839782715, "reward_std": 0.5657237768173218, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4586249887943268, "step": 3463 }, { "completion_length": 133.875, "epoch": 1.8533975387907973, "grad_norm": 0.7625765800476074, "kl": 0.19099470973014832, "learning_rate": 1.9140409821657652e-06, "loss": 0.0076, "reward": 1.90443754196167, "reward_std": 0.6184127330780029, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45131251215934753, "step": 3464 }, { "completion_length": 125.84375, "epoch": 1.8539325842696628, "grad_norm": 2.4668831825256348, "kl": 0.33903515338897705, "learning_rate": 1.912527976008725e-06, "loss": 0.0136, "reward": 1.7319375276565552, "reward_std": 0.7917971014976501, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4819374978542328, "step": 3465 }, { "completion_length": 134.1875, "epoch": 1.8544676297485285, "grad_norm": 0.9118902087211609, "kl": 0.17817991971969604, "learning_rate": 1.911015197567101e-06, "loss": 0.0071, "reward": 2.086124897003174, "reward_std": 0.8110368251800537, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4611250162124634, "step": 3466 }, { "completion_length": 102.1875, "epoch": 1.8550026752273943, "grad_norm": 0.7858413457870483, "kl": 0.19024503231048584, "learning_rate": 1.9095026474272714e-06, "loss": 0.0076, "reward": 2.9093124866485596, "reward_std": 0.8627398014068604, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47181251645088196, "step": 3467 }, { "completion_length": 125.0625, "epoch": 1.85553772070626, "grad_norm": 0.6297755241394043, "kl": 0.19497567415237427, "learning_rate": 1.907990326175531e-06, "loss": 0.0078, "reward": 2.200312614440918, "reward_std": 0.7416072487831116, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4815624952316284, "step": 3468 }, { "completion_length": 122.8125, "epoch": 1.8560727661851257, "grad_norm": 0.8491682410240173, "kl": 0.20598196983337402, "learning_rate": 1.906478234398086e-06, "loss": 0.0082, "reward": 2.1664376258850098, "reward_std": 0.5071836709976196, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4476875066757202, "step": 3469 }, { "completion_length": 131.875, "epoch": 1.8566078116639915, "grad_norm": 1.4490761756896973, "kl": 0.31913846731185913, "learning_rate": 1.9049663726810496e-06, "loss": 0.0128, "reward": 1.9606562852859497, "reward_std": 0.5357018709182739, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4137812554836273, "step": 3470 }, { "completion_length": 136.75, "epoch": 1.8571428571428572, "grad_norm": 1.424204707145691, "kl": 0.22892031073570251, "learning_rate": 1.9034547416104504e-06, "loss": 0.0092, "reward": 1.6301562786102295, "reward_std": 0.794776201248169, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4270312488079071, "step": 3471 }, { "completion_length": 136.6875, "epoch": 1.857677902621723, "grad_norm": 4.99000883102417, "kl": 0.3470724821090698, "learning_rate": 1.9019433417722238e-06, "loss": 0.0139, "reward": 1.5644375085830688, "reward_std": 0.9238656759262085, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.40818750858306885, "step": 3472 }, { "completion_length": 122.28125, "epoch": 1.8582129481005887, "grad_norm": 1.0081580877304077, "kl": 0.3396472930908203, "learning_rate": 1.9004321737522186e-06, "loss": 0.0136, "reward": 2.138000011444092, "reward_std": 0.9120299816131592, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4817499816417694, "step": 3473 }, { "completion_length": 126.46875, "epoch": 1.8587479935794544, "grad_norm": 0.6614145040512085, "kl": 0.17235082387924194, "learning_rate": 1.8989212381361926e-06, "loss": 0.0069, "reward": 2.76953125, "reward_std": 0.8127301931381226, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47265625, "step": 3474 }, { "completion_length": 123.90625, "epoch": 1.85928303905832, "grad_norm": 0.8744609951972961, "kl": 0.177043616771698, "learning_rate": 1.8974105355098122e-06, "loss": 0.0071, "reward": 2.031125068664551, "reward_std": 0.5209128856658936, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 3475 }, { "completion_length": 102.875, "epoch": 1.8598180845371857, "grad_norm": 0.8688327670097351, "kl": 0.3130752146244049, "learning_rate": 1.8959000664586562e-06, "loss": 0.0125, "reward": 2.27762508392334, "reward_std": 0.5092036128044128, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4807499945163727, "step": 3476 }, { "completion_length": 139.53125, "epoch": 1.8603531300160514, "grad_norm": 25469876.0, "kl": 5775178.5, "learning_rate": 1.89438983156821e-06, "loss": 231007.1406, "reward": 2.125, "reward_std": 0.8224826455116272, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 3477 }, { "completion_length": 113.1875, "epoch": 1.860888175494917, "grad_norm": 0.7145710587501526, "kl": 0.15626955032348633, "learning_rate": 1.89287983142387e-06, "loss": 0.0063, "reward": 2.3602187633514404, "reward_std": 0.5360455513000488, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48521876335144043, "step": 3478 }, { "completion_length": 122.90625, "epoch": 1.8614232209737827, "grad_norm": 1.8019675016403198, "kl": 0.19032035768032074, "learning_rate": 1.8913700666109418e-06, "loss": 0.0076, "reward": 2.3092498779296875, "reward_std": 0.40783926844596863, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46549999713897705, "step": 3479 }, { "completion_length": 116.9375, "epoch": 1.8619582664526484, "grad_norm": 1.5909547805786133, "kl": 0.24445617198944092, "learning_rate": 1.8898605377146383e-06, "loss": 0.0098, "reward": 2.658468723297119, "reward_std": 0.8538556694984436, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48659375309944153, "step": 3480 }, { "completion_length": 119.46875, "epoch": 1.8624933119315141, "grad_norm": 1.1568655967712402, "kl": 0.25075769424438477, "learning_rate": 1.8883512453200836e-06, "loss": 0.01, "reward": 1.9747188091278076, "reward_std": 0.9983852505683899, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49034374952316284, "step": 3481 }, { "completion_length": 122.34375, "epoch": 1.8630283574103799, "grad_norm": 0.8264912366867065, "kl": 0.1624811887741089, "learning_rate": 1.886842190012305e-06, "loss": 0.0065, "reward": 2.3587188720703125, "reward_std": 0.6466577053070068, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48371875286102295, "step": 3482 }, { "completion_length": 132.15625, "epoch": 1.8635634028892456, "grad_norm": 0.5072532296180725, "kl": 0.16304540634155273, "learning_rate": 1.885333372376244e-06, "loss": 0.0065, "reward": 2.49609375, "reward_std": 0.5770800113677979, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46484375, "step": 3483 }, { "completion_length": 123.90625, "epoch": 1.8640984483681113, "grad_norm": 1.5916035175323486, "kl": 0.22360028326511383, "learning_rate": 1.8838247929967477e-06, "loss": 0.0089, "reward": 2.285656452178955, "reward_std": 0.7397060990333557, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45753124356269836, "step": 3484 }, { "completion_length": 119.65625, "epoch": 1.864633493846977, "grad_norm": 1.0947680473327637, "kl": 0.18940450251102448, "learning_rate": 1.8823164524585675e-06, "loss": 0.0076, "reward": 2.5711874961853027, "reward_std": 0.8744558095932007, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47743749618530273, "step": 3485 }, { "completion_length": 130.25, "epoch": 1.8651685393258428, "grad_norm": 1.1799798011779785, "kl": 0.25199925899505615, "learning_rate": 1.8808083513463683e-06, "loss": 0.0101, "reward": 1.8401875495910645, "reward_std": 0.6813212633132935, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4808124899864197, "step": 3486 }, { "completion_length": 129.875, "epoch": 1.8657035848047085, "grad_norm": 1.0317585468292236, "kl": 0.20102357864379883, "learning_rate": 1.8793004902447163e-06, "loss": 0.008, "reward": 1.4999375343322754, "reward_std": 0.6648354530334473, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 3487 }, { "completion_length": 130.09375, "epoch": 1.866238630283574, "grad_norm": 1.1506192684173584, "kl": 0.1743713915348053, "learning_rate": 1.877792869738088e-06, "loss": 0.007, "reward": 1.8129687309265137, "reward_std": 1.1442344188690186, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43796873092651367, "step": 3488 }, { "completion_length": 139.0625, "epoch": 1.8667736757624398, "grad_norm": 0.8605170845985413, "kl": 0.17884482443332672, "learning_rate": 1.876285490410868e-06, "loss": 0.0072, "reward": 1.7803125381469727, "reward_std": 1.0579296350479126, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4365624785423279, "step": 3489 }, { "completion_length": 107.96875, "epoch": 1.8673087212413055, "grad_norm": 1.2631914615631104, "kl": 0.23605386912822723, "learning_rate": 1.8747783528473428e-06, "loss": 0.0094, "reward": 2.3703436851501465, "reward_std": 0.3117271959781647, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47971874475479126, "step": 3490 }, { "completion_length": 133.125, "epoch": 1.867843766720171, "grad_norm": 0.6250868439674377, "kl": 0.1666889786720276, "learning_rate": 1.8732714576317096e-06, "loss": 0.0067, "reward": 2.01953125, "reward_std": 0.9698755741119385, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47265625, "step": 3491 }, { "completion_length": 130.03125, "epoch": 1.8683788121990368, "grad_norm": 0.6672672629356384, "kl": 0.1841147243976593, "learning_rate": 1.8717648053480686e-06, "loss": 0.0074, "reward": 1.8722813129425049, "reward_std": 0.6281329989433289, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4504062533378601, "step": 3492 }, { "completion_length": 132.6875, "epoch": 1.8689138576779025, "grad_norm": 0.9098107218742371, "kl": 0.15819619596004486, "learning_rate": 1.8702583965804278e-06, "loss": 0.0063, "reward": 2.3068125247955322, "reward_std": 1.195822834968567, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4161875247955322, "step": 3493 }, { "completion_length": 114.6875, "epoch": 1.8694489031567683, "grad_norm": 2.0514307022094727, "kl": 0.23679612576961517, "learning_rate": 1.8687522319127005e-06, "loss": 0.0095, "reward": 2.308468818664551, "reward_std": 0.781851589679718, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.480343759059906, "step": 3494 }, { "completion_length": 154.25, "epoch": 1.869983948635634, "grad_norm": 1.20801842212677, "kl": 0.24461695551872253, "learning_rate": 1.8672463119287037e-06, "loss": 0.0098, "reward": 1.2596561908721924, "reward_std": 0.6792453527450562, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.40028125047683716, "step": 3495 }, { "completion_length": 137.03125, "epoch": 1.8705189941144997, "grad_norm": 0.8705445528030396, "kl": 0.1801183521747589, "learning_rate": 1.865740637212162e-06, "loss": 0.0072, "reward": 2.194218873977661, "reward_std": 0.9871852993965149, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4442187547683716, "step": 3496 }, { "completion_length": 137.96875, "epoch": 1.8710540395933655, "grad_norm": 1.3595960140228271, "kl": 0.15662150084972382, "learning_rate": 1.8642352083467036e-06, "loss": 0.0063, "reward": 1.6107499599456787, "reward_std": 0.5285372138023376, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4857499897480011, "step": 3497 }, { "completion_length": 122.875, "epoch": 1.8715890850722312, "grad_norm": 1.2741177082061768, "kl": 0.19403517246246338, "learning_rate": 1.8627300259158609e-06, "loss": 0.0078, "reward": 2.0229063034057617, "reward_std": 0.7598572969436646, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47603124380111694, "step": 3498 }, { "completion_length": 128.34375, "epoch": 1.872124130551097, "grad_norm": 1.5510082244873047, "kl": 0.1693679541349411, "learning_rate": 1.8612250905030726e-06, "loss": 0.0068, "reward": 2.1872811317443848, "reward_std": 1.0888433456420898, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4372812509536743, "step": 3499 }, { "completion_length": 112.59375, "epoch": 1.8726591760299627, "grad_norm": 0.4231148064136505, "kl": 0.19202345609664917, "learning_rate": 1.8597204026916791e-06, "loss": 0.0077, "reward": 2.097062587738037, "reward_std": 0.5640017986297607, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48768749833106995, "step": 3500 }, { "completion_length": 141.21875, "epoch": 1.8731942215088284, "grad_norm": 0.5970994830131531, "kl": 0.17009669542312622, "learning_rate": 1.858215963064927e-06, "loss": 0.0068, "reward": 2.23828125, "reward_std": 0.878571093082428, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48828125, "step": 3501 }, { "completion_length": 159.5625, "epoch": 1.873729266987694, "grad_norm": 0.7977770566940308, "kl": 0.17036418616771698, "learning_rate": 1.8567117722059663e-06, "loss": 0.0068, "reward": 2.01924991607666, "reward_std": 0.9863755702972412, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4255000054836273, "step": 3502 }, { "completion_length": 125.34375, "epoch": 1.8742643124665597, "grad_norm": 0.6543080806732178, "kl": 0.15636157989501953, "learning_rate": 1.8552078306978493e-06, "loss": 0.0063, "reward": 2.60756254196167, "reward_std": 0.6542406678199768, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48256248235702515, "step": 3503 }, { "completion_length": 152.875, "epoch": 1.8747993579454254, "grad_norm": 0.591887891292572, "kl": 0.13820290565490723, "learning_rate": 1.8537041391235345e-06, "loss": 0.0055, "reward": 1.9729374647140503, "reward_std": 1.083890438079834, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4416874945163727, "step": 3504 }, { "completion_length": 113.21875, "epoch": 1.875334403424291, "grad_norm": 0.5810990929603577, "kl": 0.1513388454914093, "learning_rate": 1.852200698065878e-06, "loss": 0.0061, "reward": 2.8125, "reward_std": 0.6754929423332214, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 3505 }, { "completion_length": 140.8125, "epoch": 1.8758694489031567, "grad_norm": 0.854216456413269, "kl": 0.1857050359249115, "learning_rate": 1.850697508107645e-06, "loss": 0.0074, "reward": 2.5672812461853027, "reward_std": 0.9517132043838501, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45790624618530273, "step": 3506 }, { "completion_length": 112.75, "epoch": 1.8764044943820224, "grad_norm": 0.7908665537834167, "kl": 0.2242640256881714, "learning_rate": 1.8491945698315014e-06, "loss": 0.009, "reward": 2.369499921798706, "reward_std": 0.8277831077575684, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49449998140335083, "step": 3507 }, { "completion_length": 137.84375, "epoch": 1.8769395398608881, "grad_norm": 2.424633026123047, "kl": 0.2470308095216751, "learning_rate": 1.847691883820012e-06, "loss": 0.0099, "reward": 1.3046875, "reward_std": 0.4764518439769745, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4765625, "step": 3508 }, { "completion_length": 137.65625, "epoch": 1.8774745853397539, "grad_norm": 0.9073361158370972, "kl": 0.15382739901542664, "learning_rate": 1.84618945065565e-06, "loss": 0.0062, "reward": 1.7218124866485596, "reward_std": 0.7014645934104919, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47181248664855957, "step": 3509 }, { "completion_length": 111.9375, "epoch": 1.8780096308186196, "grad_norm": 1.1072806119918823, "kl": 0.19075307250022888, "learning_rate": 1.8446872709207846e-06, "loss": 0.0076, "reward": 2.200906276702881, "reward_std": 0.5968868732452393, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48215624690055847, "step": 3510 }, { "completion_length": 129.0625, "epoch": 1.8785446762974853, "grad_norm": 1.3261960744857788, "kl": 0.20901258289813995, "learning_rate": 1.84318534519769e-06, "loss": 0.0084, "reward": 1.6606874465942383, "reward_std": 0.7295092940330505, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45756250619888306, "step": 3511 }, { "completion_length": 112.03125, "epoch": 1.879079721776351, "grad_norm": 1.0190227031707764, "kl": 0.24593548476696014, "learning_rate": 1.8416836740685433e-06, "loss": 0.0098, "reward": 2.6534688472747803, "reward_std": 1.0749268531799316, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4815937280654907, "step": 3512 }, { "completion_length": 144.4375, "epoch": 1.8796147672552168, "grad_norm": 1.0079916715621948, "kl": 0.20139522850513458, "learning_rate": 1.8401822581154188e-06, "loss": 0.0081, "reward": 2.0209999084472656, "reward_std": 0.9671999216079712, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4585000276565552, "step": 3513 }, { "completion_length": 127.59375, "epoch": 1.8801498127340825, "grad_norm": 0.8126810193061829, "kl": 0.1927918940782547, "learning_rate": 1.8386810979202952e-06, "loss": 0.0077, "reward": 1.7335937023162842, "reward_std": 0.7044418454170227, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45234376192092896, "step": 3514 }, { "completion_length": 115.40625, "epoch": 1.880684858212948, "grad_norm": 2.8060712814331055, "kl": 0.203668475151062, "learning_rate": 1.8371801940650501e-06, "loss": 0.0081, "reward": 1.7654688358306885, "reward_std": 0.7386940717697144, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4529687464237213, "step": 3515 }, { "completion_length": 110.65625, "epoch": 1.8812199036918138, "grad_norm": 1.649170160293579, "kl": 0.23685398697853088, "learning_rate": 1.8356795471314634e-06, "loss": 0.0095, "reward": 2.3280625343322754, "reward_std": 0.8120280504226685, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 3516 }, { "completion_length": 149.875, "epoch": 1.8817549491706795, "grad_norm": 3.0862069129943848, "kl": 0.18573282659053802, "learning_rate": 1.8341791577012148e-06, "loss": 0.0074, "reward": 1.7031875848770142, "reward_std": 0.8206333518028259, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.421937495470047, "step": 3517 }, { "completion_length": 130.28125, "epoch": 1.882289994649545, "grad_norm": 0.8177196979522705, "kl": 0.1724330484867096, "learning_rate": 1.832679026355883e-06, "loss": 0.0069, "reward": 2.3163437843322754, "reward_std": 0.9278444647789001, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.456968754529953, "step": 3518 }, { "completion_length": 125.0, "epoch": 1.8828250401284108, "grad_norm": 0.6862262487411499, "kl": 0.18723967671394348, "learning_rate": 1.8311791536769485e-06, "loss": 0.0075, "reward": 2.1865625381469727, "reward_std": 0.643362820148468, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48343750834465027, "step": 3519 }, { "completion_length": 127.28125, "epoch": 1.8833600856072765, "grad_norm": 11.274534225463867, "kl": 0.5018193125724792, "learning_rate": 1.82967954024579e-06, "loss": 0.0201, "reward": 1.82212495803833, "reward_std": 1.0096842050552368, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46275001764297485, "step": 3520 }, { "completion_length": 134.84375, "epoch": 1.8838951310861423, "grad_norm": 1.6313103437423706, "kl": 0.253692626953125, "learning_rate": 1.8281801866436868e-06, "loss": 0.0101, "reward": 1.6848437786102295, "reward_std": 0.5932487845420837, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4660937488079071, "step": 3521 }, { "completion_length": 144.96875, "epoch": 1.884430176565008, "grad_norm": 1.9324270486831665, "kl": 0.29521486163139343, "learning_rate": 1.8266810934518175e-06, "loss": 0.0118, "reward": 1.4787812232971191, "reward_std": 0.7756129503250122, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.40065622329711914, "step": 3522 }, { "completion_length": 115.5, "epoch": 1.8849652220438737, "grad_norm": 0.5448943376541138, "kl": 0.15564700961112976, "learning_rate": 1.8251822612512588e-06, "loss": 0.0062, "reward": 2.4194374084472656, "reward_std": 0.8002023100852966, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4819374978542328, "step": 3523 }, { "completion_length": 151.5625, "epoch": 1.8855002675227395, "grad_norm": 1.0331010818481445, "kl": 0.28909429907798767, "learning_rate": 1.8236836906229871e-06, "loss": 0.0116, "reward": 1.1607187986373901, "reward_std": 0.6052305102348328, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.37946873903274536, "step": 3524 }, { "completion_length": 107.84375, "epoch": 1.8860353130016052, "grad_norm": 1.8901264667510986, "kl": 0.20805081725120544, "learning_rate": 1.8221853821478763e-06, "loss": 0.0083, "reward": 1.9258124828338623, "reward_std": 0.7921339273452759, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4726875126361847, "step": 3525 }, { "completion_length": 131.53125, "epoch": 1.886570358480471, "grad_norm": 1.0177500247955322, "kl": 0.22679056227207184, "learning_rate": 1.8206873364067001e-06, "loss": 0.0091, "reward": 1.657406210899353, "reward_std": 0.7427093982696533, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4230312705039978, "step": 3526 }, { "completion_length": 113.0625, "epoch": 1.8871054039593367, "grad_norm": 0.5602735877037048, "kl": 0.1499408483505249, "learning_rate": 1.8191895539801304e-06, "loss": 0.006, "reward": 2.7235937118530273, "reward_std": 0.5729019641876221, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48921874165534973, "step": 3527 }, { "completion_length": 139.9375, "epoch": 1.8876404494382022, "grad_norm": 1.0070778131484985, "kl": 0.19495892524719238, "learning_rate": 1.8176920354487354e-06, "loss": 0.0078, "reward": 2.8029375076293945, "reward_std": 1.0445488691329956, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47481250762939453, "step": 3528 }, { "completion_length": 127.28125, "epoch": 1.888175494917068, "grad_norm": 0.7709696292877197, "kl": 0.1568801999092102, "learning_rate": 1.816194781392982e-06, "loss": 0.0063, "reward": 2.230062484741211, "reward_std": 0.9710632562637329, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49568748474121094, "step": 3529 }, { "completion_length": 118.3125, "epoch": 1.8887105403959337, "grad_norm": 1.2024996280670166, "kl": 0.18553990125656128, "learning_rate": 1.8146977923932357e-06, "loss": 0.0074, "reward": 2.2051875591278076, "reward_std": 0.9715661406517029, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42393749952316284, "step": 3530 }, { "completion_length": 139.09375, "epoch": 1.8892455858747994, "grad_norm": 1.229870319366455, "kl": 0.19296635687351227, "learning_rate": 1.8132010690297559e-06, "loss": 0.0077, "reward": 1.6250312328338623, "reward_std": 0.8006561994552612, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4531562626361847, "step": 3531 }, { "completion_length": 128.71875, "epoch": 1.889780631353665, "grad_norm": 15.28370189666748, "kl": 1.4450238943099976, "learning_rate": 1.811704611882704e-06, "loss": 0.0578, "reward": 2.0750937461853027, "reward_std": 0.9480209350585938, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45009374618530273, "step": 3532 }, { "completion_length": 105.5625, "epoch": 1.8903156768325307, "grad_norm": 2.4971063137054443, "kl": 0.23689565062522888, "learning_rate": 1.810208421532133e-06, "loss": 0.0095, "reward": 2.382625102996826, "reward_std": 0.5134764909744263, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.492000013589859, "step": 3533 }, { "completion_length": 132.46875, "epoch": 1.8908507223113964, "grad_norm": 1.2927873134613037, "kl": 0.17061015963554382, "learning_rate": 1.8087124985579956e-06, "loss": 0.0068, "reward": 1.748437523841858, "reward_std": 0.7010738849639893, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4828125238418579, "step": 3534 }, { "completion_length": 125.65625, "epoch": 1.8913857677902621, "grad_norm": 1.294956922531128, "kl": 0.17534944415092468, "learning_rate": 1.8072168435401424e-06, "loss": 0.007, "reward": 2.161281108856201, "reward_std": 0.607368528842926, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4894062578678131, "step": 3535 }, { "completion_length": 110.59375, "epoch": 1.8919208132691279, "grad_norm": 6.544619560241699, "kl": 0.5151379108428955, "learning_rate": 1.8057214570583148e-06, "loss": 0.0206, "reward": 1.8982499837875366, "reward_std": 0.6861743927001953, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4919999837875366, "step": 3536 }, { "completion_length": 135.5625, "epoch": 1.8924558587479936, "grad_norm": 2.2635483741760254, "kl": 0.14216934144496918, "learning_rate": 1.8042263396921555e-06, "loss": 0.0057, "reward": 1.11928129196167, "reward_std": 0.40559330582618713, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40053123235702515, "step": 3537 }, { "completion_length": 135.28125, "epoch": 1.8929909042268593, "grad_norm": 0.8888129591941833, "kl": 0.1847999393939972, "learning_rate": 1.8027314920211991e-06, "loss": 0.0074, "reward": 2.2281875610351562, "reward_std": 1.1345875263214111, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4625625014305115, "step": 3538 }, { "completion_length": 105.34375, "epoch": 1.893525949705725, "grad_norm": 1.7767504453659058, "kl": 0.23035040497779846, "learning_rate": 1.8012369146248779e-06, "loss": 0.0092, "reward": 2.1279687881469727, "reward_std": 0.6402266025543213, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47171875834465027, "step": 3539 }, { "completion_length": 149.59375, "epoch": 1.8940609951845908, "grad_norm": 25.601879119873047, "kl": 0.6760097742080688, "learning_rate": 1.7997426080825193e-06, "loss": 0.027, "reward": 2.055500030517578, "reward_std": 0.9906814098358154, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44612500071525574, "step": 3540 }, { "completion_length": 120.84375, "epoch": 1.8945960406634565, "grad_norm": 2.8176217079162598, "kl": 0.18413271009922028, "learning_rate": 1.7982485729733438e-06, "loss": 0.0074, "reward": 2.2447500228881836, "reward_std": 0.6530018448829651, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4634999930858612, "step": 3541 }, { "completion_length": 121.53125, "epoch": 1.895131086142322, "grad_norm": 2.0523929595947266, "kl": 0.191915363073349, "learning_rate": 1.7967548098764694e-06, "loss": 0.0077, "reward": 2.407843589782715, "reward_std": 0.4509901702404022, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47034376859664917, "step": 3542 }, { "completion_length": 157.90625, "epoch": 1.8956661316211878, "grad_norm": 0.8329558372497559, "kl": 0.16408413648605347, "learning_rate": 1.7952613193709062e-06, "loss": 0.0066, "reward": 1.2628124952316284, "reward_std": 0.8912991881370544, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3721874952316284, "step": 3543 }, { "completion_length": 140.0625, "epoch": 1.8962011771000535, "grad_norm": 0.42661088705062866, "kl": 0.14174555242061615, "learning_rate": 1.7937681020355606e-06, "loss": 0.0057, "reward": 1.9677187204360962, "reward_std": 0.29450085759162903, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4833437502384186, "step": 3544 }, { "completion_length": 146.09375, "epoch": 1.896736222578919, "grad_norm": 2.1433372497558594, "kl": 0.27895990014076233, "learning_rate": 1.792275158449232e-06, "loss": 0.0112, "reward": 1.818750023841858, "reward_std": 0.4659160375595093, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4437499940395355, "step": 3545 }, { "completion_length": 158.25, "epoch": 1.8972712680577848, "grad_norm": 0.8090018033981323, "kl": 0.15576887130737305, "learning_rate": 1.7907824891906133e-06, "loss": 0.0062, "reward": 1.3175312280654907, "reward_std": 0.7277944087982178, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4112812578678131, "step": 3546 }, { "completion_length": 159.15625, "epoch": 1.8978063135366505, "grad_norm": 0.4797896444797516, "kl": 0.15681768953800201, "learning_rate": 1.7892900948382929e-06, "loss": 0.0063, "reward": 2.181906223297119, "reward_std": 0.9940641522407532, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.40065625309944153, "step": 3547 }, { "completion_length": 131.4375, "epoch": 1.8983413590155163, "grad_norm": 0.7371060848236084, "kl": 0.16078463196754456, "learning_rate": 1.7877979759707504e-06, "loss": 0.0064, "reward": 1.5736563205718994, "reward_std": 0.7422560453414917, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43303126096725464, "step": 3548 }, { "completion_length": 109.3125, "epoch": 1.898876404494382, "grad_norm": 1.1003540754318237, "kl": 0.2185632884502411, "learning_rate": 1.78630613316636e-06, "loss": 0.0087, "reward": 1.8861563205718994, "reward_std": 0.8745232224464417, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44865626096725464, "step": 3549 }, { "completion_length": 131.3125, "epoch": 1.8994114499732477, "grad_norm": 1.2914073467254639, "kl": 0.18449825048446655, "learning_rate": 1.784814567003389e-06, "loss": 0.0074, "reward": 1.741937518119812, "reward_std": 0.7660968899726868, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.445062518119812, "step": 3550 }, { "completion_length": 152.4375, "epoch": 1.8999464954521135, "grad_norm": 1.2045764923095703, "kl": 0.13214367628097534, "learning_rate": 1.7833232780599963e-06, "loss": 0.0053, "reward": 1.574312448501587, "reward_std": 0.7733035683631897, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4649375081062317, "step": 3551 }, { "completion_length": 133.59375, "epoch": 1.9004815409309792, "grad_norm": 0.7348704934120178, "kl": 0.24212241172790527, "learning_rate": 1.7818322669142355e-06, "loss": 0.0097, "reward": 1.5220000743865967, "reward_std": 0.49121156334877014, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4595000147819519, "step": 3552 }, { "completion_length": 135.0625, "epoch": 1.901016586409845, "grad_norm": 1.8336738348007202, "kl": 0.22492536902427673, "learning_rate": 1.7803415341440484e-06, "loss": 0.009, "reward": 2.3774375915527344, "reward_std": 0.9150670766830444, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4243125021457672, "step": 3553 }, { "completion_length": 139.125, "epoch": 1.9015516318887107, "grad_norm": 1.4858345985412598, "kl": 0.1890249252319336, "learning_rate": 1.7788510803272734e-06, "loss": 0.0076, "reward": 1.8355624675750732, "reward_std": 0.5579254627227783, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41368749737739563, "step": 3554 }, { "completion_length": 141.125, "epoch": 1.9020866773675762, "grad_norm": 0.7495045065879822, "kl": 0.1674463152885437, "learning_rate": 1.7773609060416403e-06, "loss": 0.0067, "reward": 1.296875, "reward_std": 0.35730528831481934, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.453125, "step": 3555 }, { "completion_length": 130.25, "epoch": 1.902621722846442, "grad_norm": 1.0177652835845947, "kl": 0.23165683448314667, "learning_rate": 1.775871011864766e-06, "loss": 0.0093, "reward": 2.440812587738037, "reward_std": 0.7798510193824768, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44081249833106995, "step": 3556 }, { "completion_length": 124.71875, "epoch": 1.9031567683253077, "grad_norm": 0.7317509651184082, "kl": 0.2533382177352905, "learning_rate": 1.7743813983741651e-06, "loss": 0.0101, "reward": 2.726687431335449, "reward_std": 0.59726482629776, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.476687490940094, "step": 3557 }, { "completion_length": 124.46875, "epoch": 1.9036918138041734, "grad_norm": 0.6757426857948303, "kl": 0.1740218847990036, "learning_rate": 1.7728920661472379e-06, "loss": 0.007, "reward": 1.890625, "reward_std": 0.49495798349380493, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 3558 }, { "completion_length": 148.1875, "epoch": 1.904226859283039, "grad_norm": 0.6801052093505859, "kl": 0.1859596222639084, "learning_rate": 1.7714030157612795e-06, "loss": 0.0074, "reward": 1.4112187623977661, "reward_std": 0.14114990830421448, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4893437623977661, "step": 3559 }, { "completion_length": 148.125, "epoch": 1.9047619047619047, "grad_norm": 1.0956534147262573, "kl": 0.20811378955841064, "learning_rate": 1.7699142477934739e-06, "loss": 0.0083, "reward": 1.469249963760376, "reward_std": 0.719201922416687, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.40674999356269836, "step": 3560 }, { "completion_length": 125.59375, "epoch": 1.9052969502407704, "grad_norm": 11.218315124511719, "kl": 0.6262843608856201, "learning_rate": 1.7684257628208962e-06, "loss": 0.0251, "reward": 2.274343729019165, "reward_std": 0.9829393029212952, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46184372901916504, "step": 3561 }, { "completion_length": 138.6875, "epoch": 1.9058319957196361, "grad_norm": 1.0643168687820435, "kl": 0.21472495794296265, "learning_rate": 1.766937561420512e-06, "loss": 0.0086, "reward": 2.0234375, "reward_std": 0.699647843837738, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4765625, "step": 3562 }, { "completion_length": 121.25, "epoch": 1.9063670411985019, "grad_norm": 0.8311087489128113, "kl": 0.2328786998987198, "learning_rate": 1.765449644169176e-06, "loss": 0.0093, "reward": 1.9924061298370361, "reward_std": 1.115278720855713, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4611562490463257, "step": 3563 }, { "completion_length": 116.15625, "epoch": 1.9069020866773676, "grad_norm": 0.6356172561645508, "kl": 0.2236102819442749, "learning_rate": 1.7639620116436334e-06, "loss": 0.0089, "reward": 2.299062728881836, "reward_std": 0.24162691831588745, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4709375202655792, "step": 3564 }, { "completion_length": 154.53125, "epoch": 1.9074371321562333, "grad_norm": 8.146946907043457, "kl": 0.414638876914978, "learning_rate": 1.7624746644205204e-06, "loss": 0.0166, "reward": 1.3372187614440918, "reward_std": 0.8343918919563293, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3997187614440918, "step": 3565 }, { "completion_length": 132.9375, "epoch": 1.907972177635099, "grad_norm": 0.9321352243423462, "kl": 0.2820819318294525, "learning_rate": 1.76098760307636e-06, "loss": 0.0113, "reward": 2.206718683242798, "reward_std": 0.9282479286193848, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4567187428474426, "step": 3566 }, { "completion_length": 123.21875, "epoch": 1.9085072231139648, "grad_norm": 0.8198347091674805, "kl": 0.24981412291526794, "learning_rate": 1.7595008281875658e-06, "loss": 0.01, "reward": 2.5347814559936523, "reward_std": 0.8646983504295349, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48790624737739563, "step": 3567 }, { "completion_length": 140.875, "epoch": 1.9090422685928305, "grad_norm": 5.315619468688965, "kl": 0.3177083134651184, "learning_rate": 1.7580143403304412e-06, "loss": 0.0127, "reward": 1.8027812242507935, "reward_std": 0.5667567253112793, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.44340625405311584, "step": 3568 }, { "completion_length": 142.65625, "epoch": 1.909577314071696, "grad_norm": 1.8585164546966553, "kl": 0.2241154909133911, "learning_rate": 1.7565281400811763e-06, "loss": 0.009, "reward": 1.409906268119812, "reward_std": 0.6885267496109009, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4255312383174896, "step": 3569 }, { "completion_length": 102.625, "epoch": 1.9101123595505618, "grad_norm": 0.9279762506484985, "kl": 0.27613216638565063, "learning_rate": 1.7550422280158514e-06, "loss": 0.011, "reward": 1.6489999294281006, "reward_std": 0.5741342306137085, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49274998903274536, "step": 3570 }, { "completion_length": 134.46875, "epoch": 1.9106474050294275, "grad_norm": 0.8881731629371643, "kl": 0.20117530226707458, "learning_rate": 1.7535566047104336e-06, "loss": 0.008, "reward": 2.379218816757202, "reward_std": 1.1712921857833862, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4573437571525574, "step": 3571 }, { "completion_length": 108.3125, "epoch": 1.911182450508293, "grad_norm": 0.5651348829269409, "kl": 0.1662769913673401, "learning_rate": 1.7520712707407798e-06, "loss": 0.0067, "reward": 2.7691874504089355, "reward_std": 0.6465834379196167, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4879375100135803, "step": 3572 }, { "completion_length": 164.5625, "epoch": 1.9117174959871588, "grad_norm": 0.5350082516670227, "kl": 0.15645422041416168, "learning_rate": 1.750586226682634e-06, "loss": 0.0063, "reward": 1.3930624723434448, "reward_std": 0.8990697860717773, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.3930625021457672, "step": 3573 }, { "completion_length": 115.78125, "epoch": 1.9122525414660245, "grad_norm": 0.8725147843360901, "kl": 0.2366858273744583, "learning_rate": 1.7491014731116274e-06, "loss": 0.0095, "reward": 2.46806263923645, "reward_std": 0.9263399839401245, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49931249022483826, "step": 3574 }, { "completion_length": 150.09375, "epoch": 1.9127875869448903, "grad_norm": 0.5049241781234741, "kl": 0.13595940172672272, "learning_rate": 1.7476170106032796e-06, "loss": 0.0054, "reward": 1.65500009059906, "reward_std": 1.0567009449005127, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4050000011920929, "step": 3575 }, { "completion_length": 127.15625, "epoch": 1.913322632423756, "grad_norm": 2.4833264350891113, "kl": 0.24029722809791565, "learning_rate": 1.7461328397329951e-06, "loss": 0.0096, "reward": 2.288343667984009, "reward_std": 0.869870662689209, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42896875739097595, "step": 3576 }, { "completion_length": 119.0, "epoch": 1.9138576779026217, "grad_norm": 0.5504969954490662, "kl": 0.15871894359588623, "learning_rate": 1.744648961076068e-06, "loss": 0.0063, "reward": 2.0859375, "reward_std": 0.6954938173294067, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4921875, "step": 3577 }, { "completion_length": 140.03125, "epoch": 1.9143927233814875, "grad_norm": 1.2064731121063232, "kl": 0.23717528581619263, "learning_rate": 1.7431653752076793e-06, "loss": 0.0095, "reward": 2.37890625, "reward_std": 0.9414490461349487, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 3578 }, { "completion_length": 133.59375, "epoch": 1.9149277688603532, "grad_norm": 0.9654925465583801, "kl": 0.14502957463264465, "learning_rate": 1.7416820827028927e-06, "loss": 0.0058, "reward": 2.4374375343322754, "reward_std": 0.8282729387283325, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 3579 }, { "completion_length": 142.625, "epoch": 1.915462814339219, "grad_norm": 1.031057596206665, "kl": 0.1925884634256363, "learning_rate": 1.7401990841366634e-06, "loss": 0.0077, "reward": 1.6309688091278076, "reward_std": 1.0525914430618286, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42784374952316284, "step": 3580 }, { "completion_length": 119.15625, "epoch": 1.9159978598180847, "grad_norm": 0.8138336539268494, "kl": 0.2176242172718048, "learning_rate": 1.738716380083828e-06, "loss": 0.0087, "reward": 2.6507186889648438, "reward_std": 0.8884573578834534, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4475937485694885, "step": 3581 }, { "completion_length": 143.03125, "epoch": 1.9165329052969502, "grad_norm": 1.3519632816314697, "kl": 0.20262426137924194, "learning_rate": 1.7372339711191116e-06, "loss": 0.0081, "reward": 2.087156295776367, "reward_std": 0.9034863710403442, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4777812361717224, "step": 3582 }, { "completion_length": 122.25, "epoch": 1.917067950775816, "grad_norm": 1.037889003753662, "kl": 0.2085065245628357, "learning_rate": 1.7357518578171258e-06, "loss": 0.0083, "reward": 2.059187650680542, "reward_std": 0.9111123085021973, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48106253147125244, "step": 3583 }, { "completion_length": 111.6875, "epoch": 1.9176029962546817, "grad_norm": 9.233302116394043, "kl": 0.7063016891479492, "learning_rate": 1.7342700407523641e-06, "loss": 0.0283, "reward": 1.7420625686645508, "reward_std": 0.4092356562614441, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.492062509059906, "step": 3584 }, { "completion_length": 165.46875, "epoch": 1.9181380417335474, "grad_norm": 0.4921531081199646, "kl": 0.13902661204338074, "learning_rate": 1.7327885204992084e-06, "loss": 0.0056, "reward": 1.1484999656677246, "reward_std": 0.4420652985572815, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.398499995470047, "step": 3585 }, { "completion_length": 113.34375, "epoch": 1.918673087212413, "grad_norm": 0.7800278663635254, "kl": 0.1680058389902115, "learning_rate": 1.7313072976319234e-06, "loss": 0.0067, "reward": 2.76953125, "reward_std": 0.8151811361312866, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 3586 }, { "completion_length": 125.46875, "epoch": 1.9192081326912787, "grad_norm": 0.6624802947044373, "kl": 0.21820437908172607, "learning_rate": 1.72982637272466e-06, "loss": 0.0087, "reward": 2.0462498664855957, "reward_std": 0.6308003664016724, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46812498569488525, "step": 3587 }, { "completion_length": 113.28125, "epoch": 1.9197431781701444, "grad_norm": 1.4342964887619019, "kl": 0.27137869596481323, "learning_rate": 1.7283457463514536e-06, "loss": 0.0109, "reward": 2.437687397003174, "reward_std": 0.7631034851074219, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4845625162124634, "step": 3588 }, { "completion_length": 135.96875, "epoch": 1.9202782236490101, "grad_norm": 0.8845089673995972, "kl": 0.19285094738006592, "learning_rate": 1.7268654190862222e-06, "loss": 0.0077, "reward": 2.015625, "reward_std": 0.7741920948028564, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 3589 }, { "completion_length": 141.03125, "epoch": 1.9208132691278759, "grad_norm": 0.806659460067749, "kl": 0.2066151201725006, "learning_rate": 1.7253853915027703e-06, "loss": 0.0083, "reward": 1.8526874780654907, "reward_std": 0.7289441823959351, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3839375078678131, "step": 3590 }, { "completion_length": 117.34375, "epoch": 1.9213483146067416, "grad_norm": 0.996834397315979, "kl": 0.24021300673484802, "learning_rate": 1.7239056641747837e-06, "loss": 0.0096, "reward": 2.12137508392334, "reward_std": 0.6582674980163574, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4807499945163727, "step": 3591 }, { "completion_length": 140.0, "epoch": 1.9218833600856073, "grad_norm": 1.1479771137237549, "kl": 0.22884535789489746, "learning_rate": 1.7224262376758338e-06, "loss": 0.0092, "reward": 2.2421875, "reward_std": 1.00902259349823, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 3592 }, { "completion_length": 112.15625, "epoch": 1.922418405564473, "grad_norm": 1.8789595365524292, "kl": 0.20447945594787598, "learning_rate": 1.7209471125793751e-06, "loss": 0.0082, "reward": 2.577531337738037, "reward_std": 0.642525315284729, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48378124833106995, "step": 3593 }, { "completion_length": 123.4375, "epoch": 1.9229534510433388, "grad_norm": 1.120133399963379, "kl": 0.19422878324985504, "learning_rate": 1.7194682894587434e-06, "loss": 0.0078, "reward": 2.4933438301086426, "reward_std": 0.9669500589370728, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4777187407016754, "step": 3594 }, { "completion_length": 135.03125, "epoch": 1.9234884965222045, "grad_norm": 0.6539177298545837, "kl": 0.16392186284065247, "learning_rate": 1.717989768887161e-06, "loss": 0.0066, "reward": 2.466249942779541, "reward_std": 0.6076903939247131, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4818750023841858, "step": 3595 }, { "completion_length": 101.21875, "epoch": 1.92402354200107, "grad_norm": 1.1345294713974, "kl": 0.18818393349647522, "learning_rate": 1.7165115514377279e-06, "loss": 0.0075, "reward": 1.403499960899353, "reward_std": 0.24241863191127777, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4816249907016754, "step": 3596 }, { "completion_length": 149.0625, "epoch": 1.9245585874799358, "grad_norm": 5.729258060455322, "kl": 0.3576871156692505, "learning_rate": 1.7150336376834317e-06, "loss": 0.0143, "reward": 1.4318125247955322, "reward_std": 0.6359749436378479, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.38493749499320984, "step": 3597 }, { "completion_length": 132.28125, "epoch": 1.9250936329588015, "grad_norm": 1.6982632875442505, "kl": 0.2136785089969635, "learning_rate": 1.7135560281971405e-06, "loss": 0.0085, "reward": 1.6295312643051147, "reward_std": 0.4800267517566681, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.39515626430511475, "step": 3598 }, { "completion_length": 147.8125, "epoch": 1.925628678437667, "grad_norm": 1.070813775062561, "kl": 0.20311959087848663, "learning_rate": 1.7120787235516017e-06, "loss": 0.0081, "reward": 1.3852187395095825, "reward_std": 0.5704448223114014, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4477187395095825, "step": 3599 }, { "completion_length": 136.75, "epoch": 1.9261637239165328, "grad_norm": 0.6405466794967651, "kl": 0.15467774868011475, "learning_rate": 1.7106017243194486e-06, "loss": 0.0062, "reward": 2.0126562118530273, "reward_std": 0.8594996333122253, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46578124165534973, "step": 3600 }, { "completion_length": 132.09375, "epoch": 1.9266987693953985, "grad_norm": 6.617677688598633, "kl": 0.7673165202140808, "learning_rate": 1.7091250310731954e-06, "loss": 0.0307, "reward": 2.1822187900543213, "reward_std": 0.773672878742218, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4322187602519989, "step": 3601 }, { "completion_length": 142.96875, "epoch": 1.9272338148742643, "grad_norm": 2.0217292308807373, "kl": 0.30577394366264343, "learning_rate": 1.7076486443852336e-06, "loss": 0.0122, "reward": 1.2117812633514404, "reward_std": 0.571678638458252, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43053126335144043, "step": 3602 }, { "completion_length": 144.15625, "epoch": 1.92776886035313, "grad_norm": 0.7213843464851379, "kl": 0.18460577726364136, "learning_rate": 1.7061725648278424e-06, "loss": 0.0074, "reward": 1.8640937805175781, "reward_std": 1.0300847291946411, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45784375071525574, "step": 3603 }, { "completion_length": 141.21875, "epoch": 1.9283039058319957, "grad_norm": 0.9767621755599976, "kl": 0.17395907640457153, "learning_rate": 1.7046967929731758e-06, "loss": 0.007, "reward": 1.671625018119812, "reward_std": 0.8299250602722168, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4528749883174896, "step": 3604 }, { "completion_length": 160.625, "epoch": 1.9288389513108615, "grad_norm": 0.9827771186828613, "kl": 0.25376996397972107, "learning_rate": 1.703221329393272e-06, "loss": 0.0102, "reward": 1.5181875228881836, "reward_std": 0.8330023288726807, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4400625228881836, "step": 3605 }, { "completion_length": 139.0, "epoch": 1.9293739967897272, "grad_norm": 2.631957530975342, "kl": 0.4871513545513153, "learning_rate": 1.7017461746600506e-06, "loss": 0.0195, "reward": 1.6573749780654907, "reward_std": 0.6085695028305054, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4542500078678131, "step": 3606 }, { "completion_length": 125.125, "epoch": 1.929909042268593, "grad_norm": 1.0316933393478394, "kl": 0.14688396453857422, "learning_rate": 1.7002713293453077e-06, "loss": 0.0059, "reward": 2.5404999256134033, "reward_std": 0.6936378479003906, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4779999852180481, "step": 3607 }, { "completion_length": 141.71875, "epoch": 1.9304440877474587, "grad_norm": 0.801716685295105, "kl": 0.17657671868801117, "learning_rate": 1.6987967940207228e-06, "loss": 0.0071, "reward": 2.007312536239624, "reward_std": 0.7496212720870972, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47606250643730164, "step": 3608 }, { "completion_length": 135.34375, "epoch": 1.9309791332263242, "grad_norm": 1.4506607055664062, "kl": 0.22146771848201752, "learning_rate": 1.6973225692578532e-06, "loss": 0.0089, "reward": 1.4305000305175781, "reward_std": 0.5741074085235596, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44612500071525574, "step": 3609 }, { "completion_length": 145.875, "epoch": 1.93151417870519, "grad_norm": 1.3611061573028564, "kl": 0.40221744775772095, "learning_rate": 1.695848655628137e-06, "loss": 0.0161, "reward": 2.5875937938690186, "reward_std": 0.9097503423690796, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4782187342643738, "step": 3610 }, { "completion_length": 117.625, "epoch": 1.9320492241840557, "grad_norm": 1.203639268875122, "kl": 0.2947751581668854, "learning_rate": 1.6943750537028921e-06, "loss": 0.0118, "reward": 2.6947813034057617, "reward_std": 0.39888566732406616, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49165624380111694, "step": 3611 }, { "completion_length": 125.875, "epoch": 1.9325842696629212, "grad_norm": 0.6675640940666199, "kl": 0.16819068789482117, "learning_rate": 1.6929017640533141e-06, "loss": 0.0067, "reward": 2.8697500228881836, "reward_std": 0.622578501701355, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4791249930858612, "step": 3612 }, { "completion_length": 133.40625, "epoch": 1.933119315141787, "grad_norm": 1.351777195930481, "kl": 0.16770678758621216, "learning_rate": 1.6914287872504786e-06, "loss": 0.0067, "reward": 2.4726250171661377, "reward_std": 0.8927881121635437, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4413750171661377, "step": 3613 }, { "completion_length": 107.0, "epoch": 1.9336543606206527, "grad_norm": 1.7126765251159668, "kl": 0.33988192677497864, "learning_rate": 1.689956123865339e-06, "loss": 0.0136, "reward": 2.915343761444092, "reward_std": 0.5693936347961426, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4934687614440918, "step": 3614 }, { "completion_length": 103.90625, "epoch": 1.9341894060995184, "grad_norm": 1.1247825622558594, "kl": 0.3551957607269287, "learning_rate": 1.6884837744687286e-06, "loss": 0.0142, "reward": 1.7870937585830688, "reward_std": 0.768494725227356, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47459375858306885, "step": 3615 }, { "completion_length": 123.09375, "epoch": 1.9347244515783841, "grad_norm": 1.8624861240386963, "kl": 0.1738996058702469, "learning_rate": 1.6870117396313585e-06, "loss": 0.007, "reward": 2.2565624713897705, "reward_std": 0.3739815354347229, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4909375011920929, "step": 3616 }, { "completion_length": 126.40625, "epoch": 1.9352594970572499, "grad_norm": 0.9083009362220764, "kl": 0.1401093304157257, "learning_rate": 1.6855400199238165e-06, "loss": 0.0056, "reward": 2.466156244277954, "reward_std": 0.5253484845161438, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4817812442779541, "step": 3617 }, { "completion_length": 151.96875, "epoch": 1.9357945425361156, "grad_norm": 0.6926270127296448, "kl": 0.17175790667533875, "learning_rate": 1.6840686159165708e-06, "loss": 0.0069, "reward": 1.5069375038146973, "reward_std": 0.805849552154541, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.39756250381469727, "step": 3618 }, { "completion_length": 140.15625, "epoch": 1.9363295880149813, "grad_norm": 0.6056899428367615, "kl": 0.1494138538837433, "learning_rate": 1.6825975281799633e-06, "loss": 0.006, "reward": 2.01953125, "reward_std": 0.9036953449249268, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 3619 }, { "completion_length": 141.34375, "epoch": 1.936864633493847, "grad_norm": 0.5704240798950195, "kl": 0.18140944838523865, "learning_rate": 1.6811267572842182e-06, "loss": 0.0073, "reward": 1.8803750276565552, "reward_std": 0.6749496459960938, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4584999978542328, "step": 3620 }, { "completion_length": 130.96875, "epoch": 1.9373996789727128, "grad_norm": 2.9944944381713867, "kl": 0.1576111614704132, "learning_rate": 1.6796563037994348e-06, "loss": 0.0063, "reward": 1.9202499389648438, "reward_std": 0.6386164426803589, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4827499985694885, "step": 3621 }, { "completion_length": 152.3125, "epoch": 1.9379347244515785, "grad_norm": 1.1876699924468994, "kl": 0.2572234570980072, "learning_rate": 1.6781861682955864e-06, "loss": 0.0103, "reward": 1.1767187118530273, "reward_std": 0.36413300037384033, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41109374165534973, "step": 3622 }, { "completion_length": 110.9375, "epoch": 1.938469769930444, "grad_norm": 0.739914059638977, "kl": 0.20247295498847961, "learning_rate": 1.6767163513425288e-06, "loss": 0.0081, "reward": 2.5810625553131104, "reward_std": 0.6192982196807861, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4716874957084656, "step": 3623 }, { "completion_length": 101.5625, "epoch": 1.9390048154093098, "grad_norm": 1.4085193872451782, "kl": 0.17900192737579346, "learning_rate": 1.6752468535099887e-06, "loss": 0.0072, "reward": 2.0024375915527344, "reward_std": 0.5267846584320068, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4868125021457672, "step": 3624 }, { "completion_length": 149.84375, "epoch": 1.9395398608881755, "grad_norm": 4.981425762176514, "kl": 0.16650155186653137, "learning_rate": 1.6737776753675723e-06, "loss": 0.0067, "reward": 1.8942186832427979, "reward_std": 1.171097993850708, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3942187428474426, "step": 3625 }, { "completion_length": 124.5625, "epoch": 1.940074906367041, "grad_norm": 1.7102470397949219, "kl": 0.18756401538848877, "learning_rate": 1.6723088174847624e-06, "loss": 0.0075, "reward": 2.941812515258789, "reward_std": 1.0016041994094849, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4730624854564667, "step": 3626 }, { "completion_length": 141.75, "epoch": 1.9406099518459068, "grad_norm": 0.9030538201332092, "kl": 0.21471133828163147, "learning_rate": 1.670840280430915e-06, "loss": 0.0086, "reward": 1.8806250095367432, "reward_std": 1.1682742834091187, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44312500953674316, "step": 3627 }, { "completion_length": 109.0625, "epoch": 1.9411449973247725, "grad_norm": 0.7795835137367249, "kl": 0.16525031626224518, "learning_rate": 1.6693720647752642e-06, "loss": 0.0066, "reward": 2.3537187576293945, "reward_std": 0.5025510787963867, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49434375762939453, "step": 3628 }, { "completion_length": 130.9375, "epoch": 1.9416800428036383, "grad_norm": 1.0435917377471924, "kl": 0.13447412848472595, "learning_rate": 1.6679041710869176e-06, "loss": 0.0054, "reward": 2.5787501335144043, "reward_std": 0.504249632358551, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46937501430511475, "step": 3629 }, { "completion_length": 137.21875, "epoch": 1.942215088282504, "grad_norm": 1.2385503053665161, "kl": 0.18459518253803253, "learning_rate": 1.6664365999348594e-06, "loss": 0.0074, "reward": 1.751312494277954, "reward_std": 0.9357926249504089, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4388124942779541, "step": 3630 }, { "completion_length": 132.28125, "epoch": 1.9427501337613697, "grad_norm": 2.144028902053833, "kl": 0.22888454794883728, "learning_rate": 1.664969351887949e-06, "loss": 0.0092, "reward": 1.9893437623977661, "reward_std": 0.6703600287437439, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4737187623977661, "step": 3631 }, { "completion_length": 126.6875, "epoch": 1.9432851792402355, "grad_norm": 1.8257677555084229, "kl": 0.3526953458786011, "learning_rate": 1.6635024275149186e-06, "loss": 0.0141, "reward": 1.8676249980926514, "reward_std": 1.0877172946929932, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43012499809265137, "step": 3632 }, { "completion_length": 129.78125, "epoch": 1.9438202247191012, "grad_norm": 4.6041998863220215, "kl": 0.4151003360748291, "learning_rate": 1.662035827384377e-06, "loss": 0.0166, "reward": 2.5146875381469727, "reward_std": 1.069458246231079, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4678124785423279, "step": 3633 }, { "completion_length": 152.375, "epoch": 1.944355270197967, "grad_norm": 3.3689510822296143, "kl": 0.28022056818008423, "learning_rate": 1.660569552064806e-06, "loss": 0.0112, "reward": 2.1650311946868896, "reward_std": 1.240608811378479, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4462812542915344, "step": 3634 }, { "completion_length": 123.96875, "epoch": 1.9448903156768327, "grad_norm": 0.8042221069335938, "kl": 0.2784610986709595, "learning_rate": 1.6591036021245622e-06, "loss": 0.0111, "reward": 2.3630623817443848, "reward_std": 0.49845045804977417, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4724375009536743, "step": 3635 }, { "completion_length": 132.15625, "epoch": 1.9454253611556982, "grad_norm": 1.9417788982391357, "kl": 0.18575742840766907, "learning_rate": 1.6576379781318768e-06, "loss": 0.0074, "reward": 2.1449999809265137, "reward_std": 0.723002552986145, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45749998092651367, "step": 3636 }, { "completion_length": 143.8125, "epoch": 1.945960406634564, "grad_norm": 1.0139110088348389, "kl": 0.14902406930923462, "learning_rate": 1.6561726806548523e-06, "loss": 0.006, "reward": 2.7907187938690186, "reward_std": 0.9218019843101501, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4782187342643738, "step": 3637 }, { "completion_length": 118.9375, "epoch": 1.9464954521134297, "grad_norm": 0.7329988479614258, "kl": 0.20982033014297485, "learning_rate": 1.6547077102614664e-06, "loss": 0.0084, "reward": 2.44140625, "reward_std": 0.5595835447311401, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 3638 }, { "completion_length": 128.0625, "epoch": 1.9470304975922952, "grad_norm": 4.9392523765563965, "kl": 0.3137243390083313, "learning_rate": 1.6532430675195706e-06, "loss": 0.0125, "reward": 2.1970937252044678, "reward_std": 0.9771965742111206, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46271875500679016, "step": 3639 }, { "completion_length": 128.4375, "epoch": 1.947565543071161, "grad_norm": 1.1298784017562866, "kl": 0.17210620641708374, "learning_rate": 1.6517787529968877e-06, "loss": 0.0069, "reward": 1.859375, "reward_std": 0.6212736368179321, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 3640 }, { "completion_length": 128.25, "epoch": 1.9481005885500267, "grad_norm": 1.3673607110977173, "kl": 0.41685956716537476, "learning_rate": 1.6503147672610149e-06, "loss": 0.0167, "reward": 2.79325008392334, "reward_std": 0.8271589279174805, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48075002431869507, "step": 3641 }, { "completion_length": 137.0, "epoch": 1.9486356340288924, "grad_norm": 1.113191843032837, "kl": 0.20151440799236298, "learning_rate": 1.6488511108794194e-06, "loss": 0.0081, "reward": 1.7838125228881836, "reward_std": 0.8500303626060486, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4556874930858612, "step": 3642 }, { "completion_length": 128.34375, "epoch": 1.9491706795077581, "grad_norm": 1.0854902267456055, "kl": 0.1726665198802948, "learning_rate": 1.6473877844194438e-06, "loss": 0.0069, "reward": 2.2890000343322754, "reward_std": 0.9025948643684387, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.476500004529953, "step": 3643 }, { "completion_length": 126.125, "epoch": 1.9497057249866239, "grad_norm": 0.9651102423667908, "kl": 0.192527174949646, "learning_rate": 1.6459247884483027e-06, "loss": 0.0077, "reward": 2.375, "reward_std": 0.9971315860748291, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 3644 }, { "completion_length": 125.34375, "epoch": 1.9502407704654896, "grad_norm": 1.0490500926971436, "kl": 0.29086625576019287, "learning_rate": 1.6444621235330777e-06, "loss": 0.0116, "reward": 2.0428123474121094, "reward_std": 0.7822909355163574, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4803124964237213, "step": 3645 }, { "completion_length": 132.1875, "epoch": 1.9507758159443553, "grad_norm": 0.8850722908973694, "kl": 0.2727244198322296, "learning_rate": 1.6429997902407296e-06, "loss": 0.0109, "reward": 1.7039062976837158, "reward_std": 0.7676718235015869, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46953123807907104, "step": 3646 }, { "completion_length": 115.65625, "epoch": 1.951310861423221, "grad_norm": 1.5217679738998413, "kl": 0.21189001202583313, "learning_rate": 1.6415377891380846e-06, "loss": 0.0085, "reward": 2.609375, "reward_std": 0.6200236678123474, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 3647 }, { "completion_length": 137.46875, "epoch": 1.9518459069020868, "grad_norm": 8.03663444519043, "kl": 1.3867285251617432, "learning_rate": 1.640076120791842e-06, "loss": 0.0555, "reward": 1.238937497138977, "reward_std": 0.7612828612327576, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.37956249713897705, "step": 3648 }, { "completion_length": 124.90625, "epoch": 1.9523809523809523, "grad_norm": 1.937591552734375, "kl": 0.20294195413589478, "learning_rate": 1.6386147857685753e-06, "loss": 0.0081, "reward": 2.489062547683716, "reward_std": 0.9769197106361389, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48906248807907104, "step": 3649 }, { "completion_length": 131.84375, "epoch": 1.952915997859818, "grad_norm": 0.41791898012161255, "kl": 0.1633654534816742, "learning_rate": 1.6371537846347225e-06, "loss": 0.0065, "reward": 2.3405938148498535, "reward_std": 0.8227587938308716, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48121875524520874, "step": 3650 }, { "completion_length": 158.09375, "epoch": 1.9534510433386838, "grad_norm": 0.9670942425727844, "kl": 0.15202519297599792, "learning_rate": 1.635693117956598e-06, "loss": 0.0061, "reward": 1.2829999923706055, "reward_std": 0.6527261734008789, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.37674999237060547, "step": 3651 }, { "completion_length": 130.84375, "epoch": 1.9539860888175495, "grad_norm": 0.8946314454078674, "kl": 0.22567309439182281, "learning_rate": 1.6342327863003825e-06, "loss": 0.009, "reward": 1.8984375, "reward_std": 0.9482024312019348, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4453125, "step": 3652 }, { "completion_length": 134.5, "epoch": 1.954521134296415, "grad_norm": 1.0006941556930542, "kl": 0.19844922423362732, "learning_rate": 1.63277279023213e-06, "loss": 0.0079, "reward": 2.17578125, "reward_std": 0.36011236906051636, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 3653 }, { "completion_length": 124.46875, "epoch": 1.9550561797752808, "grad_norm": 0.9201217293739319, "kl": 0.16404178738594055, "learning_rate": 1.6313131303177626e-06, "loss": 0.0066, "reward": 2.2379374504089355, "reward_std": 0.565012514591217, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4723125100135803, "step": 3654 }, { "completion_length": 143.1875, "epoch": 1.9555912252541465, "grad_norm": 1.1895644664764404, "kl": 0.1845639944076538, "learning_rate": 1.6298538071230712e-06, "loss": 0.0074, "reward": 1.5044375658035278, "reward_std": 0.9600841999053955, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42631250619888306, "step": 3655 }, { "completion_length": 123.84375, "epoch": 1.9561262707330123, "grad_norm": 0.8990504145622253, "kl": 0.1739211529493332, "learning_rate": 1.6283948212137197e-06, "loss": 0.007, "reward": 1.9821875095367432, "reward_std": 0.7764357328414917, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46656250953674316, "step": 3656 }, { "completion_length": 133.5, "epoch": 1.956661316211878, "grad_norm": 0.6706025004386902, "kl": 0.13468286395072937, "learning_rate": 1.6269361731552369e-06, "loss": 0.0054, "reward": 2.1872501373291016, "reward_std": 0.5619874596595764, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.452875018119812, "step": 3657 }, { "completion_length": 131.625, "epoch": 1.9571963616907437, "grad_norm": 1.5508908033370972, "kl": 0.1780124306678772, "learning_rate": 1.6254778635130237e-06, "loss": 0.0071, "reward": 1.9120311737060547, "reward_std": 0.6909515261650085, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.45890626311302185, "step": 3658 }, { "completion_length": 131.0, "epoch": 1.9577314071696095, "grad_norm": 1.200660228729248, "kl": 0.2000758945941925, "learning_rate": 1.6240198928523487e-06, "loss": 0.008, "reward": 2.58021879196167, "reward_std": 0.6088744401931763, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48646876215934753, "step": 3659 }, { "completion_length": 121.25, "epoch": 1.9582664526484752, "grad_norm": 1.0000355243682861, "kl": 0.20159253478050232, "learning_rate": 1.6225622617383494e-06, "loss": 0.0081, "reward": 2.515625, "reward_std": 0.5100911855697632, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 3660 }, { "completion_length": 119.40625, "epoch": 1.958801498127341, "grad_norm": 0.7952340245246887, "kl": 0.19071319699287415, "learning_rate": 1.6211049707360317e-06, "loss": 0.0076, "reward": 2.335124969482422, "reward_std": 0.7578139901161194, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49137499928474426, "step": 3661 }, { "completion_length": 129.8125, "epoch": 1.9593365436062067, "grad_norm": 1.293800950050354, "kl": 0.22182349860668182, "learning_rate": 1.6196480204102687e-06, "loss": 0.0089, "reward": 1.9676563739776611, "reward_std": 1.144418478012085, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4520312547683716, "step": 3662 }, { "completion_length": 123.40625, "epoch": 1.9598715890850722, "grad_norm": 1.152172565460205, "kl": 0.2623233199119568, "learning_rate": 1.6181914113258029e-06, "loss": 0.0105, "reward": 2.421875, "reward_std": 0.4761107861995697, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 3663 }, { "completion_length": 116.5625, "epoch": 1.960406634563938, "grad_norm": 0.718845546245575, "kl": 0.158922016620636, "learning_rate": 1.6167351440472445e-06, "loss": 0.0064, "reward": 2.478250026702881, "reward_std": 0.7011743187904358, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47824999690055847, "step": 3664 }, { "completion_length": 137.09375, "epoch": 1.9609416800428037, "grad_norm": 2.047797441482544, "kl": 0.21211013197898865, "learning_rate": 1.6152792191390692e-06, "loss": 0.0085, "reward": 1.9427812099456787, "reward_std": 1.0128358602523804, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4584062397480011, "step": 3665 }, { "completion_length": 127.53125, "epoch": 1.9614767255216692, "grad_norm": 0.6953848004341125, "kl": 0.15438544750213623, "learning_rate": 1.6138236371656233e-06, "loss": 0.0062, "reward": 2.39453125, "reward_std": 1.0665141344070435, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 3666 }, { "completion_length": 137.25, "epoch": 1.962011771000535, "grad_norm": 0.7924994826316833, "kl": 0.16201075911521912, "learning_rate": 1.6123683986911153e-06, "loss": 0.0065, "reward": 1.522937536239624, "reward_std": 0.7688347101211548, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44481247663497925, "step": 3667 }, { "completion_length": 107.9375, "epoch": 1.9625468164794007, "grad_norm": 0.9704720973968506, "kl": 0.24516478180885315, "learning_rate": 1.610913504279626e-06, "loss": 0.0098, "reward": 2.0471248626708984, "reward_std": 0.9420830011367798, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4846250116825104, "step": 3668 }, { "completion_length": 134.90625, "epoch": 1.9630818619582664, "grad_norm": 1.3521491289138794, "kl": 0.19454097747802734, "learning_rate": 1.609458954495101e-06, "loss": 0.0078, "reward": 2.17578125, "reward_std": 0.9762197136878967, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 3669 }, { "completion_length": 126.125, "epoch": 1.9636169074371321, "grad_norm": 1.0375392436981201, "kl": 0.18571743369102478, "learning_rate": 1.6080047499013484e-06, "loss": 0.0074, "reward": 1.7268438339233398, "reward_std": 0.9000155329704285, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4455937445163727, "step": 3670 }, { "completion_length": 143.71875, "epoch": 1.9641519529159979, "grad_norm": 0.8060900568962097, "kl": 0.15189027786254883, "learning_rate": 1.6065508910620482e-06, "loss": 0.0061, "reward": 2.201218605041504, "reward_std": 0.6975598335266113, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48246875405311584, "step": 3671 }, { "completion_length": 130.1875, "epoch": 1.9646869983948636, "grad_norm": 34118304.0, "kl": 1643355.625, "learning_rate": 1.6050973785407447e-06, "loss": 65734.2266, "reward": 2.477781295776367, "reward_std": 0.8228890895843506, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4777812361717224, "step": 3672 }, { "completion_length": 137.84375, "epoch": 1.9652220438737293, "grad_norm": 0.6552762985229492, "kl": 0.17209050059318542, "learning_rate": 1.6036442129008446e-06, "loss": 0.0069, "reward": 1.9735937118530273, "reward_std": 0.8265457153320312, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4579687714576721, "step": 3673 }, { "completion_length": 117.125, "epoch": 1.965757089352595, "grad_norm": 0.5671567320823669, "kl": 0.1507929116487503, "learning_rate": 1.602191394705625e-06, "loss": 0.006, "reward": 2.1001875400543213, "reward_std": 0.523503303527832, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4751875102519989, "step": 3674 }, { "completion_length": 143.8125, "epoch": 1.9662921348314608, "grad_norm": 0.9010342359542847, "kl": 0.1722821742296219, "learning_rate": 1.6007389245182242e-06, "loss": 0.0069, "reward": 2.640625, "reward_std": 0.5731281042098999, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 3675 }, { "completion_length": 127.25, "epoch": 1.9668271803103263, "grad_norm": 5.179325103759766, "kl": 0.19563260674476624, "learning_rate": 1.5992868029016489e-06, "loss": 0.0078, "reward": 2.543875217437744, "reward_std": 0.7892325520515442, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4970000088214874, "step": 3676 }, { "completion_length": 118.65625, "epoch": 1.967362225789192, "grad_norm": 0.7054325938224792, "kl": 0.26620104908943176, "learning_rate": 1.597835030418769e-06, "loss": 0.0106, "reward": 2.241593837738037, "reward_std": 0.36866819858551025, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47596874833106995, "step": 3677 }, { "completion_length": 145.78125, "epoch": 1.9678972712680578, "grad_norm": 0.9695895910263062, "kl": 0.14240235090255737, "learning_rate": 1.5963836076323188e-06, "loss": 0.0057, "reward": 2.6058125495910645, "reward_std": 0.8812978267669678, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4651874899864197, "step": 3678 }, { "completion_length": 127.875, "epoch": 1.9684323167469235, "grad_norm": 1.3890516757965088, "kl": 0.22726386785507202, "learning_rate": 1.5949325351048984e-06, "loss": 0.0091, "reward": 1.8837499618530273, "reward_std": 0.6424785852432251, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47749999165534973, "step": 3679 }, { "completion_length": 113.75, "epoch": 1.968967362225789, "grad_norm": 1.130650281906128, "kl": 0.19346389174461365, "learning_rate": 1.5934818133989704e-06, "loss": 0.0077, "reward": 2.31040620803833, "reward_std": 0.5023197531700134, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48228126764297485, "step": 3680 }, { "completion_length": 107.75, "epoch": 1.9695024077046548, "grad_norm": 7023.0986328125, "kl": 958.933349609375, "learning_rate": 1.592031443076863e-06, "loss": 38.3573, "reward": 1.828125, "reward_std": 0.7018800973892212, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 3681 }, { "completion_length": 127.34375, "epoch": 1.9700374531835205, "grad_norm": 2.0239014625549316, "kl": 0.2194066196680069, "learning_rate": 1.5905814247007673e-06, "loss": 0.0088, "reward": 1.71875, "reward_std": 0.9511511325836182, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.453125, "step": 3682 }, { "completion_length": 138.21875, "epoch": 1.9705724986623863, "grad_norm": 1.1777976751327515, "kl": 0.30764657258987427, "learning_rate": 1.5891317588327382e-06, "loss": 0.0123, "reward": 2.0892186164855957, "reward_std": 0.8641878366470337, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47984373569488525, "step": 3683 }, { "completion_length": 135.78125, "epoch": 1.971107544141252, "grad_norm": 2.423325538635254, "kl": 0.17534607648849487, "learning_rate": 1.587682446034694e-06, "loss": 0.007, "reward": 2.5466251373291016, "reward_std": 0.6969402432441711, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.484125018119812, "step": 3684 }, { "completion_length": 154.90625, "epoch": 1.9716425896201177, "grad_norm": 1.0448987483978271, "kl": 0.1599825918674469, "learning_rate": 1.586233486868416e-06, "loss": 0.0064, "reward": 1.3655312061309814, "reward_std": 0.7452225685119629, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4124062657356262, "step": 3685 }, { "completion_length": 121.21875, "epoch": 1.9721776350989835, "grad_norm": 0.9276245832443237, "kl": 0.18804600834846497, "learning_rate": 1.5847848818955486e-06, "loss": 0.0075, "reward": 2.1343436241149902, "reward_std": 0.5951799154281616, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4780937433242798, "step": 3686 }, { "completion_length": 142.5, "epoch": 1.9727126805778492, "grad_norm": 1.0127400159835815, "kl": 0.14743570983409882, "learning_rate": 1.5833366316775994e-06, "loss": 0.0059, "reward": 2.4851250648498535, "reward_std": 0.8520947694778442, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.43825000524520874, "step": 3687 }, { "completion_length": 120.34375, "epoch": 1.973247726056715, "grad_norm": 1.7908180952072144, "kl": 0.3179199695587158, "learning_rate": 1.5818887367759367e-06, "loss": 0.0127, "reward": 1.9591562747955322, "reward_std": 0.7136178016662598, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4904062747955322, "step": 3688 }, { "completion_length": 122.375, "epoch": 1.9737827715355807, "grad_norm": 1.450579285621643, "kl": 0.1832388937473297, "learning_rate": 1.5804411977517942e-06, "loss": 0.0073, "reward": 3.027125120162964, "reward_std": 0.8777536153793335, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4802500009536743, "step": 3689 }, { "completion_length": 141.03125, "epoch": 1.9743178170144462, "grad_norm": 1.4174232482910156, "kl": 0.31909865140914917, "learning_rate": 1.578994015166263e-06, "loss": 0.0128, "reward": 2.1652188301086426, "reward_std": 0.9213415384292603, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4308437407016754, "step": 3690 }, { "completion_length": 147.15625, "epoch": 1.974852862493312, "grad_norm": 1.0080829858779907, "kl": 0.29319900274276733, "learning_rate": 1.577547189580301e-06, "loss": 0.0117, "reward": 1.796875, "reward_std": 0.8669509291648865, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.453125, "step": 3691 }, { "completion_length": 130.9375, "epoch": 1.9753879079721777, "grad_norm": 3.488245725631714, "kl": 0.30211055278778076, "learning_rate": 1.576100721554726e-06, "loss": 0.0121, "reward": 2.29715633392334, "reward_std": 0.9556587934494019, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4534062445163727, "step": 3692 }, { "completion_length": 132.625, "epoch": 1.9759229534510432, "grad_norm": 0.954774022102356, "kl": 0.23367106914520264, "learning_rate": 1.574654611650214e-06, "loss": 0.0093, "reward": 1.4903125762939453, "reward_std": 0.5561014413833618, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45906248688697815, "step": 3693 }, { "completion_length": 124.5625, "epoch": 1.976457998929909, "grad_norm": 0.431941956281662, "kl": 0.1756184697151184, "learning_rate": 1.5732088604273082e-06, "loss": 0.007, "reward": 2.4212188720703125, "reward_std": 0.6093875765800476, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45246875286102295, "step": 3694 }, { "completion_length": 126.65625, "epoch": 1.9769930444087747, "grad_norm": 1.1556432247161865, "kl": 0.20442640781402588, "learning_rate": 1.5717634684464067e-06, "loss": 0.0082, "reward": 2.253781318664551, "reward_std": 0.8851593732833862, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 3695 }, { "completion_length": 137.90625, "epoch": 1.9775280898876404, "grad_norm": 0.8639407753944397, "kl": 0.20821744203567505, "learning_rate": 1.570318436267772e-06, "loss": 0.0083, "reward": 2.1982498168945312, "reward_std": 0.9891554117202759, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4482499957084656, "step": 3696 }, { "completion_length": 132.90625, "epoch": 1.9780631353665061, "grad_norm": 1.415555715560913, "kl": 0.1419951468706131, "learning_rate": 1.5688737644515279e-06, "loss": 0.0057, "reward": 2.1343436241149902, "reward_std": 1.0057426691055298, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4468437433242798, "step": 3697 }, { "completion_length": 159.625, "epoch": 1.9785981808453719, "grad_norm": 0.800277829170227, "kl": 0.14868506789207458, "learning_rate": 1.5674294535576543e-06, "loss": 0.0059, "reward": 1.78125, "reward_std": 0.4808991253376007, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.453125, "step": 3698 }, { "completion_length": 128.875, "epoch": 1.9791332263242376, "grad_norm": 1.2178566455841064, "kl": 0.19214478135108948, "learning_rate": 1.5659855041459957e-06, "loss": 0.0077, "reward": 2.266531229019165, "reward_std": 1.0255440473556519, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4540312588214874, "step": 3699 }, { "completion_length": 155.46875, "epoch": 1.9796682718031033, "grad_norm": 0.8972702026367188, "kl": 0.17254482209682465, "learning_rate": 1.5645419167762532e-06, "loss": 0.0069, "reward": 1.4328436851501465, "reward_std": 0.8992109298706055, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40159374475479126, "step": 3700 }, { "completion_length": 161.8125, "epoch": 1.980203317281969, "grad_norm": 1.1629760265350342, "kl": 0.1578083634376526, "learning_rate": 1.5630986920079899e-06, "loss": 0.0063, "reward": 1.3889687061309814, "reward_std": 0.6161400079727173, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3577187657356262, "step": 3701 }, { "completion_length": 117.84375, "epoch": 1.9807383627608348, "grad_norm": 0.9898071885108948, "kl": 0.15837153792381287, "learning_rate": 1.5616558304006274e-06, "loss": 0.0063, "reward": 1.9922187328338623, "reward_std": 0.4262726902961731, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4922187626361847, "step": 3702 }, { "completion_length": 129.5, "epoch": 1.9812734082397003, "grad_norm": 0.7476096153259277, "kl": 0.1850106418132782, "learning_rate": 1.5602133325134455e-06, "loss": 0.0074, "reward": 2.3219375610351562, "reward_std": 0.7254916429519653, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4781875014305115, "step": 3703 }, { "completion_length": 115.3125, "epoch": 1.981808453718566, "grad_norm": 15.141971588134766, "kl": 1.0195566415786743, "learning_rate": 1.5587711989055853e-06, "loss": 0.0408, "reward": 2.2609686851501465, "reward_std": 0.7444140315055847, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47971874475479126, "step": 3704 }, { "completion_length": 112.40625, "epoch": 1.9823434991974318, "grad_norm": 2.2923636436462402, "kl": 0.25476518273353577, "learning_rate": 1.5573294301360442e-06, "loss": 0.0102, "reward": 2.491187572479248, "reward_std": 0.7019106149673462, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.49118751287460327, "step": 3705 }, { "completion_length": 119.375, "epoch": 1.9828785446762975, "grad_norm": 2.167358160018921, "kl": 0.30374807119369507, "learning_rate": 1.55588802676368e-06, "loss": 0.0121, "reward": 2.6840312480926514, "reward_std": 0.7009077072143555, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48090624809265137, "step": 3706 }, { "completion_length": 161.75, "epoch": 1.983413590155163, "grad_norm": 1.2136915922164917, "kl": 0.13232970237731934, "learning_rate": 1.5544469893472092e-06, "loss": 0.0053, "reward": 2.0517187118530273, "reward_std": 1.1619677543640137, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4110937714576721, "step": 3707 }, { "completion_length": 109.28125, "epoch": 1.9839486356340288, "grad_norm": 1.6504706144332886, "kl": 0.22260558605194092, "learning_rate": 1.5530063184452037e-06, "loss": 0.0089, "reward": 2.2771248817443848, "reward_std": 0.644136905670166, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4802500009536743, "step": 3708 }, { "completion_length": 136.25, "epoch": 1.9844836811128945, "grad_norm": 2.4306912422180176, "kl": 0.16187244653701782, "learning_rate": 1.551566014616096e-06, "loss": 0.0065, "reward": 2.0507187843322754, "reward_std": 0.8723176121711731, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.410093754529953, "step": 3709 }, { "completion_length": 133.0, "epoch": 1.9850187265917603, "grad_norm": 0.24510447680950165, "kl": 0.14187216758728027, "learning_rate": 1.5501260784181763e-06, "loss": 0.0057, "reward": 2.425874948501587, "reward_std": 0.39630961418151855, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4415000081062317, "step": 3710 }, { "completion_length": 127.1875, "epoch": 1.985553772070626, "grad_norm": 0.35065898299217224, "kl": 0.1528996378183365, "learning_rate": 1.5486865104095898e-06, "loss": 0.0061, "reward": 2.6875, "reward_std": 0.3597595691680908, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46875, "step": 3711 }, { "completion_length": 129.90625, "epoch": 1.9860888175494917, "grad_norm": 1.9096006155014038, "kl": 0.19099849462509155, "learning_rate": 1.5472473111483428e-06, "loss": 0.0076, "reward": 1.903937578201294, "reward_std": 0.9444037675857544, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4664374887943268, "step": 3712 }, { "completion_length": 151.96875, "epoch": 1.9866238630283575, "grad_norm": 1.2020857334136963, "kl": 0.3244999945163727, "learning_rate": 1.5458084811922935e-06, "loss": 0.013, "reward": 1.300531268119812, "reward_std": 0.7937700748443604, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.409906268119812, "step": 3713 }, { "completion_length": 145.59375, "epoch": 1.9871589085072232, "grad_norm": 0.7434720396995544, "kl": 0.1534394919872284, "learning_rate": 1.5443700210991625e-06, "loss": 0.0061, "reward": 1.7441563606262207, "reward_std": 0.855674147605896, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41603124141693115, "step": 3714 }, { "completion_length": 109.3125, "epoch": 1.987693953986089, "grad_norm": 0.8535357713699341, "kl": 0.224266916513443, "learning_rate": 1.5429319314265245e-06, "loss": 0.009, "reward": 2.80078125, "reward_std": 0.9497029185295105, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 3715 }, { "completion_length": 120.5625, "epoch": 1.9882289994649547, "grad_norm": 0.6557080149650574, "kl": 0.23207300901412964, "learning_rate": 1.5414942127318083e-06, "loss": 0.0093, "reward": 2.2730000019073486, "reward_std": 0.4495435357093811, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46050000190734863, "step": 3716 }, { "completion_length": 128.96875, "epoch": 1.9887640449438202, "grad_norm": 0.6913798451423645, "kl": 0.14181645214557648, "learning_rate": 1.5400568655723044e-06, "loss": 0.0057, "reward": 2.112187385559082, "reward_std": 1.0407588481903076, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4559375047683716, "step": 3717 }, { "completion_length": 150.28125, "epoch": 1.989299090422686, "grad_norm": 0.699723482131958, "kl": 0.12245437502861023, "learning_rate": 1.5386198905051532e-06, "loss": 0.0049, "reward": 1.610687494277954, "reward_std": 0.7758801579475403, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4388124942779541, "step": 3718 }, { "completion_length": 129.96875, "epoch": 1.9898341359015517, "grad_norm": 1.0651721954345703, "kl": 0.17715419828891754, "learning_rate": 1.5371832880873547e-06, "loss": 0.0071, "reward": 1.550187587738037, "reward_std": 0.7442049980163574, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45643749833106995, "step": 3719 }, { "completion_length": 142.40625, "epoch": 1.9903691813804172, "grad_norm": 1.1673451662063599, "kl": 0.19733425974845886, "learning_rate": 1.5357470588757654e-06, "loss": 0.0079, "reward": 1.8865938186645508, "reward_std": 0.8528186082839966, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.464718759059906, "step": 3720 }, { "completion_length": 120.1875, "epoch": 1.990904226859283, "grad_norm": 0.758076012134552, "kl": 0.15272650122642517, "learning_rate": 1.5343112034270924e-06, "loss": 0.0061, "reward": 2.48828125, "reward_std": 0.7014518976211548, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 3721 }, { "completion_length": 147.90625, "epoch": 1.9914392723381487, "grad_norm": 1.4781582355499268, "kl": 0.14983013272285461, "learning_rate": 1.5328757222979027e-06, "loss": 0.006, "reward": 1.745593786239624, "reward_std": 0.7737661600112915, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46434375643730164, "step": 3722 }, { "completion_length": 126.03125, "epoch": 1.9919743178170144, "grad_norm": 1.2097634077072144, "kl": 0.2055349349975586, "learning_rate": 1.531440616044615e-06, "loss": 0.0082, "reward": 2.109250068664551, "reward_std": 0.5898415446281433, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 3723 }, { "completion_length": 132.25, "epoch": 1.9925093632958801, "grad_norm": 0.6664714813232422, "kl": 0.17538659274578094, "learning_rate": 1.530005885223505e-06, "loss": 0.007, "reward": 2.5450000762939453, "reward_std": 0.5651928782463074, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48250001668930054, "step": 3724 }, { "completion_length": 131.21875, "epoch": 1.9930444087747459, "grad_norm": 1.2543714046478271, "kl": 0.1886204183101654, "learning_rate": 1.5285715303907017e-06, "loss": 0.0075, "reward": 1.6231250762939453, "reward_std": 0.8700497150421143, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.42000001668930054, "step": 3725 }, { "completion_length": 123.75, "epoch": 1.9935794542536116, "grad_norm": 0.4234620928764343, "kl": 0.14129173755645752, "learning_rate": 1.527137552102188e-06, "loss": 0.0057, "reward": 2.814500093460083, "reward_std": 0.8580809831619263, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47074997425079346, "step": 3726 }, { "completion_length": 117.9375, "epoch": 1.9941144997324773, "grad_norm": 1.1403589248657227, "kl": 0.19204038381576538, "learning_rate": 1.5257039509138022e-06, "loss": 0.0077, "reward": 2.1341874599456787, "reward_std": 0.5083947777748108, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4623124897480011, "step": 3727 }, { "completion_length": 130.375, "epoch": 1.994649545211343, "grad_norm": 0.804774284362793, "kl": 0.17178577184677124, "learning_rate": 1.5242707273812342e-06, "loss": 0.0069, "reward": 1.94921875, "reward_std": 0.7223412990570068, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46484375, "step": 3728 }, { "completion_length": 117.40625, "epoch": 1.9951845906902088, "grad_norm": 1.4482876062393188, "kl": 0.1685350388288498, "learning_rate": 1.5228378820600303e-06, "loss": 0.0067, "reward": 2.328125, "reward_std": 0.738341212272644, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 3729 }, { "completion_length": 128.375, "epoch": 1.9957196361690743, "grad_norm": 1.8960968255996704, "kl": 0.4402351975440979, "learning_rate": 1.5214054155055885e-06, "loss": 0.0176, "reward": 1.9299062490463257, "reward_std": 0.8945124745368958, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4611562490463257, "step": 3730 }, { "completion_length": 124.5625, "epoch": 1.99625468164794, "grad_norm": 0.8178673386573792, "kl": 0.17874480783939362, "learning_rate": 1.5199733282731597e-06, "loss": 0.0071, "reward": 2.3102500438690186, "reward_std": 0.8007888793945312, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4664999842643738, "step": 3731 }, { "completion_length": 137.3125, "epoch": 1.9967897271268058, "grad_norm": 0.8240591883659363, "kl": 0.1942169964313507, "learning_rate": 1.5185416209178497e-06, "loss": 0.0078, "reward": 2.0006561279296875, "reward_std": 0.7161298990249634, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.43815624713897705, "step": 3732 }, { "completion_length": 128.625, "epoch": 1.9973247726056713, "grad_norm": 1.0364766120910645, "kl": 0.1667904257774353, "learning_rate": 1.517110293994613e-06, "loss": 0.0067, "reward": 2.2180938720703125, "reward_std": 0.47341763973236084, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.43684375286102295, "step": 3733 }, { "completion_length": 101.96875, "epoch": 1.997859818084537, "grad_norm": 1.2267522811889648, "kl": 0.35487988591194153, "learning_rate": 1.515679348058262e-06, "loss": 0.0142, "reward": 1.9483437538146973, "reward_std": 0.6058109402656555, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49521875381469727, "step": 3734 }, { "completion_length": 111.15625, "epoch": 1.9983948635634028, "grad_norm": 0.4344557225704193, "kl": 0.17865943908691406, "learning_rate": 1.5142487836634588e-06, "loss": 0.0071, "reward": 1.8711562156677246, "reward_std": 0.6389785408973694, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4649062752723694, "step": 3735 }, { "completion_length": 122.03125, "epoch": 1.9989299090422685, "grad_norm": 0.7710269689559937, "kl": 0.13756774365901947, "learning_rate": 1.512818601364715e-06, "loss": 0.0055, "reward": 1.572812557220459, "reward_std": 0.577293872833252, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4321874976158142, "step": 3736 }, { "completion_length": 127.25, "epoch": 1.9994649545211343, "grad_norm": 1.9683376550674438, "kl": 0.15132971107959747, "learning_rate": 1.5113888017163997e-06, "loss": 0.0061, "reward": 2.42578125, "reward_std": 0.4762299060821533, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 3737 }, { "completion_length": 159.0, "epoch": 2.0, "grad_norm": 0.6582648158073425, "kl": 0.11374945938587189, "learning_rate": 1.5099593852727284e-06, "loss": 0.0045, "reward": 3.5, "reward_std": 1.4345811605453491, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 3738 }, { "completion_length": 141.90625, "epoch": 2.0005350454788657, "grad_norm": 1.314542531967163, "kl": 0.14934374392032623, "learning_rate": 1.5085303525877704e-06, "loss": 0.006, "reward": 1.7522499561309814, "reward_std": 0.9182028770446777, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.43974998593330383, "step": 3739 }, { "completion_length": 135.1875, "epoch": 2.0010700909577315, "grad_norm": 1.883758544921875, "kl": 0.17459836602210999, "learning_rate": 1.5071017042154485e-06, "loss": 0.007, "reward": 1.7279062271118164, "reward_std": 0.5705491900444031, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4466562867164612, "step": 3740 }, { "completion_length": 126.25, "epoch": 2.001605136436597, "grad_norm": 23.234310150146484, "kl": 0.4833563566207886, "learning_rate": 1.5056734407095316e-06, "loss": 0.0193, "reward": 2.17578125, "reward_std": 0.8321325182914734, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 3741 }, { "completion_length": 125.1875, "epoch": 2.002140181915463, "grad_norm": 0.8825715780258179, "kl": 0.20616546273231506, "learning_rate": 1.5042455626236424e-06, "loss": 0.0082, "reward": 1.813906192779541, "reward_std": 0.80748450756073, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4701562523841858, "step": 3742 }, { "completion_length": 135.46875, "epoch": 2.0026752273943287, "grad_norm": 0.4268606901168823, "kl": 0.13839495182037354, "learning_rate": 1.5028180705112561e-06, "loss": 0.0055, "reward": 1.736687421798706, "reward_std": 0.6494581699371338, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47106248140335083, "step": 3743 }, { "completion_length": 122.78125, "epoch": 2.0032102728731944, "grad_norm": 1.5616717338562012, "kl": 0.17566600441932678, "learning_rate": 1.501390964925693e-06, "loss": 0.007, "reward": 2.2578125, "reward_std": 0.6234082579612732, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4296875, "step": 3744 }, { "completion_length": 129.46875, "epoch": 2.0037453183520597, "grad_norm": 0.7017888426780701, "kl": 0.13164713978767395, "learning_rate": 1.4999642464201287e-06, "loss": 0.0053, "reward": 2.06640625, "reward_std": 1.0543545484542847, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 3745 }, { "completion_length": 132.21875, "epoch": 2.0042803638309254, "grad_norm": 52.766136169433594, "kl": 0.5214701890945435, "learning_rate": 1.498537915547585e-06, "loss": 0.0209, "reward": 2.568906307220459, "reward_std": 0.9284114241600037, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3814062476158142, "step": 3746 }, { "completion_length": 137.125, "epoch": 2.004815409309791, "grad_norm": 0.9280478954315186, "kl": 0.183735653758049, "learning_rate": 1.497111972860936e-06, "loss": 0.0073, "reward": 1.9945937395095825, "reward_std": 0.5358538031578064, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4789687395095825, "step": 3747 }, { "completion_length": 120.90625, "epoch": 2.005350454788657, "grad_norm": 2.4782514572143555, "kl": 0.534712553024292, "learning_rate": 1.4956864189129044e-06, "loss": 0.0214, "reward": 1.7088749408721924, "reward_std": 0.6430697441101074, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45887500047683716, "step": 3748 }, { "completion_length": 130.90625, "epoch": 2.0058855002675227, "grad_norm": 1.028380274772644, "kl": 0.21434298157691956, "learning_rate": 1.4942612542560622e-06, "loss": 0.0086, "reward": 1.741031289100647, "reward_std": 0.701640248298645, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4597812294960022, "step": 3749 }, { "completion_length": 123.59375, "epoch": 2.0064205457463884, "grad_norm": 1.5632193088531494, "kl": 0.1605345904827118, "learning_rate": 1.4928364794428307e-06, "loss": 0.0064, "reward": 2.044250011444092, "reward_std": 0.7342772483825684, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4504999816417694, "step": 3750 }, { "completion_length": 163.71875, "epoch": 2.006955591225254, "grad_norm": 0.6766614317893982, "kl": 0.11533248424530029, "learning_rate": 1.491412095025479e-06, "loss": 0.0046, "reward": 1.6501250267028809, "reward_std": 0.9342430830001831, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3688749670982361, "step": 3751 }, { "completion_length": 138.6875, "epoch": 2.00749063670412, "grad_norm": 0.63933265209198, "kl": 0.13436278700828552, "learning_rate": 1.4899881015561265e-06, "loss": 0.0054, "reward": 2.107062339782715, "reward_std": 0.8408859968185425, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43518751859664917, "step": 3752 }, { "completion_length": 132.53125, "epoch": 2.0080256821829856, "grad_norm": 1.5206379890441895, "kl": 0.17666155099868774, "learning_rate": 1.4885644995867409e-06, "loss": 0.0071, "reward": 2.2805938720703125, "reward_std": 0.9418667554855347, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43684375286102295, "step": 3753 }, { "completion_length": 158.375, "epoch": 2.0085607276618513, "grad_norm": 1.2952529191970825, "kl": 0.18357494473457336, "learning_rate": 1.4871412896691366e-06, "loss": 0.0073, "reward": 1.448062539100647, "reward_std": 0.5859041213989258, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4324374794960022, "step": 3754 }, { "completion_length": 130.71875, "epoch": 2.009095773140717, "grad_norm": 0.791262686252594, "kl": 0.1569252610206604, "learning_rate": 1.4857184723549779e-06, "loss": 0.0063, "reward": 2.0244998931884766, "reward_std": 0.8483060598373413, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4776250123977661, "step": 3755 }, { "completion_length": 134.6875, "epoch": 2.009630818619583, "grad_norm": 1.6790518760681152, "kl": 0.2023908495903015, "learning_rate": 1.4842960481957747e-06, "loss": 0.0081, "reward": 2.386218786239624, "reward_std": 0.9641432166099548, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43309375643730164, "step": 3756 }, { "completion_length": 147.4375, "epoch": 2.0101658640984486, "grad_norm": 2.8755922317504883, "kl": 0.3443911373615265, "learning_rate": 1.4828740177428874e-06, "loss": 0.0138, "reward": 2.253812551498413, "reward_std": 0.9615557193756104, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4256874918937683, "step": 3757 }, { "completion_length": 150.15625, "epoch": 2.0107009095773143, "grad_norm": 0.5788829326629639, "kl": 0.136502206325531, "learning_rate": 1.4814523815475218e-06, "loss": 0.0055, "reward": 2.3676249980926514, "reward_std": 1.0162477493286133, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46137499809265137, "step": 3758 }, { "completion_length": 154.25, "epoch": 2.0112359550561796, "grad_norm": 1.4629111289978027, "kl": 0.24053946137428284, "learning_rate": 1.4800311401607303e-06, "loss": 0.0096, "reward": 1.9726874828338623, "reward_std": 0.7553264498710632, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3789375126361847, "step": 3759 }, { "completion_length": 161.65625, "epoch": 2.0117710005350453, "grad_norm": 1.3599554300308228, "kl": 0.17111405730247498, "learning_rate": 1.4786102941334157e-06, "loss": 0.0068, "reward": 1.624750018119812, "reward_std": 0.9557234048843384, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3903749883174896, "step": 3760 }, { "completion_length": 151.5625, "epoch": 2.012306046013911, "grad_norm": 1.2910364866256714, "kl": 0.17868992686271667, "learning_rate": 1.4771898440163223e-06, "loss": 0.0071, "reward": 1.8896563053131104, "reward_std": 0.7926764488220215, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4209062457084656, "step": 3761 }, { "completion_length": 126.59375, "epoch": 2.012841091492777, "grad_norm": 1.5219477415084839, "kl": 0.370620459318161, "learning_rate": 1.475769790360045e-06, "loss": 0.0148, "reward": 1.5943125486373901, "reward_std": 0.8789227604866028, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43806248903274536, "step": 3762 }, { "completion_length": 125.09375, "epoch": 2.0133761369716425, "grad_norm": 0.6580132246017456, "kl": 0.13841691613197327, "learning_rate": 1.4743501337150246e-06, "loss": 0.0055, "reward": 2.2734375, "reward_std": 0.7405873537063599, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 3763 }, { "completion_length": 140.5, "epoch": 2.0139111824505083, "grad_norm": 0.9917346835136414, "kl": 0.22934705018997192, "learning_rate": 1.472930874631546e-06, "loss": 0.0092, "reward": 1.9246875047683716, "reward_std": 0.8671627640724182, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4246875047683716, "step": 3764 }, { "completion_length": 144.28125, "epoch": 2.014446227929374, "grad_norm": 1.1846787929534912, "kl": 0.18332445621490479, "learning_rate": 1.4715120136597422e-06, "loss": 0.0073, "reward": 1.7921874523162842, "reward_std": 0.7212777137756348, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44843751192092896, "step": 3765 }, { "completion_length": 104.625, "epoch": 2.0149812734082397, "grad_norm": 1.5039135217666626, "kl": 0.35202527046203613, "learning_rate": 1.4700935513495899e-06, "loss": 0.0141, "reward": 3.130781412124634, "reward_std": 0.600054144859314, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49015626311302185, "step": 3766 }, { "completion_length": 142.75, "epoch": 2.0155163188871055, "grad_norm": 1.1046332120895386, "kl": 0.1512536257505417, "learning_rate": 1.4686754882509128e-06, "loss": 0.0061, "reward": 1.4073749780654907, "reward_std": 0.6123749613761902, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4698750078678131, "step": 3767 }, { "completion_length": 131.375, "epoch": 2.016051364365971, "grad_norm": 0.6708117723464966, "kl": 0.12609192728996277, "learning_rate": 1.4672578249133796e-06, "loss": 0.005, "reward": 1.9383749961853027, "reward_std": 0.9556742906570435, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46962499618530273, "step": 3768 }, { "completion_length": 146.03125, "epoch": 2.016586409844837, "grad_norm": 0.9137414693832397, "kl": 0.20079851150512695, "learning_rate": 1.4658405618865035e-06, "loss": 0.008, "reward": 1.78515625, "reward_std": 0.8922817707061768, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44140625, "step": 3769 }, { "completion_length": 127.0, "epoch": 2.0171214553237027, "grad_norm": 0.8964540362358093, "kl": 0.18114545941352844, "learning_rate": 1.4644236997196428e-06, "loss": 0.0072, "reward": 1.3592500686645508, "reward_std": 0.6139646768569946, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.437375009059906, "step": 3770 }, { "completion_length": 143.65625, "epoch": 2.0176565008025684, "grad_norm": 0.4703572988510132, "kl": 0.14537449181079865, "learning_rate": 1.4630072389620003e-06, "loss": 0.0058, "reward": 2.66015625, "reward_std": 0.6588294506072998, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44140625, "step": 3771 }, { "completion_length": 147.59375, "epoch": 2.0181915462814337, "grad_norm": 0.7558117508888245, "kl": 0.1551474630832672, "learning_rate": 1.4615911801626232e-06, "loss": 0.0062, "reward": 1.6193437576293945, "reward_std": 0.7562192678451538, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43184375762939453, "step": 3772 }, { "completion_length": 120.34375, "epoch": 2.0187265917602994, "grad_norm": 1.6292728185653687, "kl": 0.17594411969184875, "learning_rate": 1.4601755238704041e-06, "loss": 0.007, "reward": 2.1304688453674316, "reward_std": 0.547926664352417, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4742187559604645, "step": 3773 }, { "completion_length": 105.53125, "epoch": 2.019261637239165, "grad_norm": 1.0592354536056519, "kl": 0.30204248428344727, "learning_rate": 1.458760270634077e-06, "loss": 0.0121, "reward": 2.341125011444092, "reward_std": 0.6175684928894043, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4817500114440918, "step": 3774 }, { "completion_length": 119.4375, "epoch": 2.019796682718031, "grad_norm": 0.7609295845031738, "kl": 0.15740808844566345, "learning_rate": 1.4573454210022223e-06, "loss": 0.0063, "reward": 2.5314061641693115, "reward_std": 0.647842288017273, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4689062237739563, "step": 3775 }, { "completion_length": 144.25, "epoch": 2.0203317281968967, "grad_norm": 0.6409999132156372, "kl": 0.12520377337932587, "learning_rate": 1.4559309755232626e-06, "loss": 0.005, "reward": 2.217968702316284, "reward_std": 0.8790438175201416, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45234376192092896, "step": 3776 }, { "completion_length": 149.53125, "epoch": 2.0208667736757624, "grad_norm": 1.217834234237671, "kl": 0.1751442551612854, "learning_rate": 1.4545169347454627e-06, "loss": 0.007, "reward": 1.510812520980835, "reward_std": 0.7274977564811707, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4170624911785126, "step": 3777 }, { "completion_length": 113.96875, "epoch": 2.021401819154628, "grad_norm": 0.961595356464386, "kl": 0.1967834234237671, "learning_rate": 1.4531032992169341e-06, "loss": 0.0079, "reward": 2.5349998474121094, "reward_std": 0.7882871627807617, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4724999964237213, "step": 3778 }, { "completion_length": 123.46875, "epoch": 2.021936864633494, "grad_norm": 1.628678321838379, "kl": 0.1867743879556656, "learning_rate": 1.4516900694856278e-06, "loss": 0.0075, "reward": 1.9387812614440918, "reward_std": 0.6462781429290771, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4856562614440918, "step": 3779 }, { "completion_length": 124.0, "epoch": 2.0224719101123596, "grad_norm": 0.6867090463638306, "kl": 0.1775660514831543, "learning_rate": 1.4502772460993387e-06, "loss": 0.0071, "reward": 2.0100936889648438, "reward_std": 0.4175272583961487, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4788437485694885, "step": 3780 }, { "completion_length": 127.09375, "epoch": 2.0230069555912253, "grad_norm": 1.167968988418579, "kl": 0.19098597764968872, "learning_rate": 1.4488648296057046e-06, "loss": 0.0076, "reward": 1.9615936279296875, "reward_std": 0.25671184062957764, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47721874713897705, "step": 3781 }, { "completion_length": 121.1875, "epoch": 2.023542001070091, "grad_norm": 1.6666572093963623, "kl": 0.19044625759124756, "learning_rate": 1.4474528205522054e-06, "loss": 0.0076, "reward": 1.6768124103546143, "reward_std": 0.754180908203125, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4424374997615814, "step": 3782 }, { "completion_length": 135.96875, "epoch": 2.024077046548957, "grad_norm": 0.8178337216377258, "kl": 0.14295974373817444, "learning_rate": 1.4460412194861625e-06, "loss": 0.0057, "reward": 1.8685624599456787, "reward_std": 0.5874313116073608, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4310625195503235, "step": 3783 }, { "completion_length": 127.53125, "epoch": 2.0246120920278226, "grad_norm": 1.07879638671875, "kl": 0.13754725456237793, "learning_rate": 1.4446300269547387e-06, "loss": 0.0055, "reward": 1.4135937690734863, "reward_std": 0.41127341985702515, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46046876907348633, "step": 3784 }, { "completion_length": 126.34375, "epoch": 2.0251471375066883, "grad_norm": 1.6127632856369019, "kl": 0.1813584566116333, "learning_rate": 1.4432192435049407e-06, "loss": 0.0073, "reward": 1.9862186908721924, "reward_std": 0.8609797358512878, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45496875047683716, "step": 3785 }, { "completion_length": 138.0, "epoch": 2.0256821829855536, "grad_norm": 0.6477506160736084, "kl": 0.21310701966285706, "learning_rate": 1.4418088696836147e-06, "loss": 0.0085, "reward": 2.261593818664551, "reward_std": 1.0052623748779297, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.464718759059906, "step": 3786 }, { "completion_length": 121.1875, "epoch": 2.0262172284644193, "grad_norm": 0.999031126499176, "kl": 0.28946948051452637, "learning_rate": 1.440398906037448e-06, "loss": 0.0116, "reward": 2.5705313682556152, "reward_std": 0.6852011680603027, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4767812490463257, "step": 3787 }, { "completion_length": 145.28125, "epoch": 2.026752273943285, "grad_norm": 1.6128690242767334, "kl": 0.22994446754455566, "learning_rate": 1.4389893531129706e-06, "loss": 0.0092, "reward": 1.6848437786102295, "reward_std": 0.9165247678756714, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4348437786102295, "step": 3788 }, { "completion_length": 130.3125, "epoch": 2.027287319422151, "grad_norm": 31.5162410736084, "kl": 2.4116241931915283, "learning_rate": 1.4375802114565515e-06, "loss": 0.0965, "reward": 2.3989062309265137, "reward_std": 0.8002761602401733, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46140623092651367, "step": 3789 }, { "completion_length": 139.59375, "epoch": 2.0278223649010165, "grad_norm": 1.798748254776001, "kl": 0.2666778266429901, "learning_rate": 1.4361714816144007e-06, "loss": 0.0107, "reward": 1.1064375638961792, "reward_std": 0.3954417109489441, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4189375042915344, "step": 3790 }, { "completion_length": 126.9375, "epoch": 2.0283574103798823, "grad_norm": 0.6918255686759949, "kl": 0.16956481337547302, "learning_rate": 1.43476316413257e-06, "loss": 0.0068, "reward": 2.471874952316284, "reward_std": 0.7656055688858032, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47187501192092896, "step": 3791 }, { "completion_length": 141.3125, "epoch": 2.028892455858748, "grad_norm": 0.9490920305252075, "kl": 0.1457681655883789, "learning_rate": 1.4333552595569495e-06, "loss": 0.0058, "reward": 2.548874855041504, "reward_std": 0.7697238922119141, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48637500405311584, "step": 3792 }, { "completion_length": 135.84375, "epoch": 2.0294275013376137, "grad_norm": 0.9044893980026245, "kl": 0.2006089687347412, "learning_rate": 1.4319477684332705e-06, "loss": 0.008, "reward": 1.2180312871932983, "reward_std": 0.5407760143280029, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42115625739097595, "step": 3793 }, { "completion_length": 124.5, "epoch": 2.0299625468164795, "grad_norm": 0.6371613144874573, "kl": 0.2071153223514557, "learning_rate": 1.4305406913071018e-06, "loss": 0.0083, "reward": 2.065406322479248, "reward_std": 0.5161699056625366, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47165626287460327, "step": 3794 }, { "completion_length": 132.3125, "epoch": 2.030497592295345, "grad_norm": 1.0060240030288696, "kl": 0.12932726740837097, "learning_rate": 1.429134028723856e-06, "loss": 0.0052, "reward": 2.7321250438690186, "reward_std": 1.1529240608215332, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45087501406669617, "step": 3795 }, { "completion_length": 142.90625, "epoch": 2.031032637774211, "grad_norm": 2.2218332290649414, "kl": 0.16034501791000366, "learning_rate": 1.427727781228781e-06, "loss": 0.0064, "reward": 1.9252500534057617, "reward_std": 1.0106127262115479, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47212499380111694, "step": 3796 }, { "completion_length": 119.84375, "epoch": 2.0315676832530767, "grad_norm": 0.9531065821647644, "kl": 0.250397264957428, "learning_rate": 1.4263219493669646e-06, "loss": 0.01, "reward": 1.8905625343322754, "reward_std": 0.42871570587158203, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 3797 }, { "completion_length": 127.9375, "epoch": 2.0321027287319424, "grad_norm": 0.9694652557373047, "kl": 0.17845410108566284, "learning_rate": 1.4249165336833373e-06, "loss": 0.0071, "reward": 1.7820625305175781, "reward_std": 0.7038744688034058, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4539375305175781, "step": 3798 }, { "completion_length": 132.40625, "epoch": 2.0326377742108077, "grad_norm": 0.761907696723938, "kl": 0.17570139467716217, "learning_rate": 1.4235115347226607e-06, "loss": 0.007, "reward": 2.1720314025878906, "reward_std": 0.9341155886650085, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4689062535762787, "step": 3799 }, { "completion_length": 145.53125, "epoch": 2.0331728196896734, "grad_norm": 1.1030139923095703, "kl": 0.2772740423679352, "learning_rate": 1.4221069530295428e-06, "loss": 0.0111, "reward": 1.5113437175750732, "reward_std": 0.9674907922744751, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.38634371757507324, "step": 3800 }, { "completion_length": 120.125, "epoch": 2.033707865168539, "grad_norm": 0.6328108310699463, "kl": 0.17972858250141144, "learning_rate": 1.4207027891484252e-06, "loss": 0.0072, "reward": 2.019124984741211, "reward_std": 0.24777555465698242, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4878750145435333, "step": 3801 }, { "completion_length": 138.0, "epoch": 2.034242910647405, "grad_norm": 2.2820048332214355, "kl": 0.1687937080860138, "learning_rate": 1.4192990436235877e-06, "loss": 0.0068, "reward": 2.4000937938690186, "reward_std": 0.7896637916564941, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4625937342643738, "step": 3802 }, { "completion_length": 152.46875, "epoch": 2.0347779561262707, "grad_norm": 1.0354551076889038, "kl": 0.183159738779068, "learning_rate": 1.4178957169991507e-06, "loss": 0.0073, "reward": 2.1838436126708984, "reward_std": 1.0029256343841553, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.465093731880188, "step": 3803 }, { "completion_length": 132.78125, "epoch": 2.0353130016051364, "grad_norm": 2.5265393257141113, "kl": 0.26580309867858887, "learning_rate": 1.4164928098190698e-06, "loss": 0.0106, "reward": 2.1794064044952393, "reward_std": 0.7827932834625244, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4762812554836273, "step": 3804 }, { "completion_length": 126.59375, "epoch": 2.035848047084002, "grad_norm": 71.87223815917969, "kl": 3.776397466659546, "learning_rate": 1.4150903226271375e-06, "loss": 0.1511, "reward": 2.013437509536743, "reward_std": 1.0508610010147095, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4509374797344208, "step": 3805 }, { "completion_length": 144.46875, "epoch": 2.036383092562868, "grad_norm": 1.7472124099731445, "kl": 0.26548564434051514, "learning_rate": 1.413688255966987e-06, "loss": 0.0106, "reward": 1.9636563062667847, "reward_std": 0.5417798757553101, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4324062466621399, "step": 3806 }, { "completion_length": 136.4375, "epoch": 2.0369181380417336, "grad_norm": 0.827923059463501, "kl": 0.17662015557289124, "learning_rate": 1.4122866103820853e-06, "loss": 0.0071, "reward": 2.231874942779541, "reward_std": 0.8540496826171875, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4506250023841858, "step": 3807 }, { "completion_length": 135.21875, "epoch": 2.0374531835205993, "grad_norm": 253.17623901367188, "kl": 10.039907455444336, "learning_rate": 1.4108853864157365e-06, "loss": 0.4016, "reward": 1.721562385559082, "reward_std": 0.6183880567550659, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4403125047683716, "step": 3808 }, { "completion_length": 129.59375, "epoch": 2.037988228999465, "grad_norm": 0.9825323224067688, "kl": 0.16389071941375732, "learning_rate": 1.4094845846110813e-06, "loss": 0.0066, "reward": 2.4282500743865967, "reward_std": 0.906906008720398, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4751250147819519, "step": 3809 }, { "completion_length": 126.25, "epoch": 2.038523274478331, "grad_norm": 1.984283685684204, "kl": 0.28350886702537537, "learning_rate": 1.4080842055110994e-06, "loss": 0.0113, "reward": 1.507906198501587, "reward_std": 0.8165614008903503, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4297812581062317, "step": 3810 }, { "completion_length": 131.4375, "epoch": 2.0390583199571966, "grad_norm": 3.2105276584625244, "kl": 0.22207435965538025, "learning_rate": 1.406684249658603e-06, "loss": 0.0089, "reward": 1.6056562662124634, "reward_std": 0.5682437419891357, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.433781236410141, "step": 3811 }, { "completion_length": 111.28125, "epoch": 2.039593365436062, "grad_norm": 2.3784029483795166, "kl": 0.2355738878250122, "learning_rate": 1.4052847175962416e-06, "loss": 0.0094, "reward": 2.3552498817443848, "reward_std": 0.9855897426605225, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4646250009536743, "step": 3812 }, { "completion_length": 125.375, "epoch": 2.0401284109149276, "grad_norm": 2017.1094970703125, "kl": 160.361083984375, "learning_rate": 1.4038856098665022e-06, "loss": 6.4144, "reward": 1.9885938167572021, "reward_std": 0.6360337734222412, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4885937571525574, "step": 3813 }, { "completion_length": 140.3125, "epoch": 2.0406634563937933, "grad_norm": 1.2265143394470215, "kl": 0.16335394978523254, "learning_rate": 1.4024869270117048e-06, "loss": 0.0065, "reward": 1.7496875524520874, "reward_std": 0.7175925970077515, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4059374928474426, "step": 3814 }, { "completion_length": 139.75, "epoch": 2.041198501872659, "grad_norm": 1.466209888458252, "kl": 0.19972194731235504, "learning_rate": 1.4010886695740051e-06, "loss": 0.008, "reward": 2.015625, "reward_std": 0.8701196908950806, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.453125, "step": 3815 }, { "completion_length": 105.15625, "epoch": 2.041733547351525, "grad_norm": 1.6003601551055908, "kl": 0.18960535526275635, "learning_rate": 1.3996908380953964e-06, "loss": 0.0076, "reward": 2.583031177520752, "reward_std": 0.9071453213691711, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47365623712539673, "step": 3816 }, { "completion_length": 137.5, "epoch": 2.0422685928303905, "grad_norm": 0.9168112874031067, "kl": 0.1328968107700348, "learning_rate": 1.398293433117702e-06, "loss": 0.0053, "reward": 1.7737812995910645, "reward_std": 0.9848059415817261, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4456562399864197, "step": 3817 }, { "completion_length": 99.6875, "epoch": 2.0428036383092563, "grad_norm": 0.7200464606285095, "kl": 0.15992356836795807, "learning_rate": 1.3968964551825842e-06, "loss": 0.0064, "reward": 2.296875, "reward_std": 0.30296874046325684, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 3818 }, { "completion_length": 136.0, "epoch": 2.043338683788122, "grad_norm": 0.7997047305107117, "kl": 0.15879854559898376, "learning_rate": 1.395499904831541e-06, "loss": 0.0064, "reward": 2.2363438606262207, "reward_std": 0.6389517784118652, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.42384374141693115, "step": 3819 }, { "completion_length": 146.53125, "epoch": 2.0438737292669877, "grad_norm": 5.7308759689331055, "kl": 0.6244599223136902, "learning_rate": 1.3941037826058978e-06, "loss": 0.025, "reward": 1.8367812633514404, "reward_std": 0.8199577331542969, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41490626335144043, "step": 3820 }, { "completion_length": 116.46875, "epoch": 2.0444087747458535, "grad_norm": 0.7475249171257019, "kl": 0.17251721024513245, "learning_rate": 1.3927080890468215e-06, "loss": 0.0069, "reward": 2.833312511444092, "reward_std": 0.8153377771377563, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4739375114440918, "step": 3821 }, { "completion_length": 145.25, "epoch": 2.044943820224719, "grad_norm": 0.5995250940322876, "kl": 0.14741076529026031, "learning_rate": 1.3913128246953084e-06, "loss": 0.0059, "reward": 1.5063749551773071, "reward_std": 0.8997244834899902, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4282499849796295, "step": 3822 }, { "completion_length": 108.78125, "epoch": 2.045478865703585, "grad_norm": 1.048776626586914, "kl": 0.28423047065734863, "learning_rate": 1.3899179900921884e-06, "loss": 0.0114, "reward": 2.5829687118530273, "reward_std": 1.0416333675384521, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45796874165534973, "step": 3823 }, { "completion_length": 140.71875, "epoch": 2.0460139111824507, "grad_norm": 0.7299221754074097, "kl": 0.17818546295166016, "learning_rate": 1.3885235857781288e-06, "loss": 0.0071, "reward": 1.3677812814712524, "reward_std": 0.706426203250885, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.39903128147125244, "step": 3824 }, { "completion_length": 118.1875, "epoch": 2.0465489566613164, "grad_norm": 5.399540424346924, "kl": 0.24490442872047424, "learning_rate": 1.3871296122936262e-06, "loss": 0.0098, "reward": 2.703125, "reward_std": 1.0055135488510132, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 3825 }, { "completion_length": 133.28125, "epoch": 2.0470840021401817, "grad_norm": 0.5332852602005005, "kl": 0.17543095350265503, "learning_rate": 1.3857360701790112e-06, "loss": 0.007, "reward": 1.9757499694824219, "reward_std": 0.8496063947677612, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.44450002908706665, "step": 3826 }, { "completion_length": 95.90625, "epoch": 2.0476190476190474, "grad_norm": 0.4954249858856201, "kl": 0.16468970477581024, "learning_rate": 1.3843429599744467e-06, "loss": 0.0066, "reward": 3.4135000705718994, "reward_std": 0.2446589469909668, "rewards/correctness_reward_func": 1.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49162501096725464, "step": 3827 }, { "completion_length": 118.375, "epoch": 2.048154093097913, "grad_norm": 0.624765157699585, "kl": 0.18949782848358154, "learning_rate": 1.3829502822199308e-06, "loss": 0.0076, "reward": 2.731187343597412, "reward_std": 0.43888092041015625, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48118749260902405, "step": 3828 }, { "completion_length": 157.4375, "epoch": 2.048689138576779, "grad_norm": 4.178957462310791, "kl": 0.37586289644241333, "learning_rate": 1.3815580374552912e-06, "loss": 0.015, "reward": 1.8387187719345093, "reward_std": 1.1018640995025635, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.3855937421321869, "step": 3829 }, { "completion_length": 146.96875, "epoch": 2.0492241840556447, "grad_norm": 5.564434051513672, "kl": 0.37005311250686646, "learning_rate": 1.3801662262201875e-06, "loss": 0.0148, "reward": 1.227218747138977, "reward_std": 0.5726773142814636, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41471874713897705, "step": 3830 }, { "completion_length": 125.625, "epoch": 2.0497592295345104, "grad_norm": 0.7770229578018188, "kl": 0.19607457518577576, "learning_rate": 1.3787748490541144e-06, "loss": 0.0078, "reward": 2.1262500286102295, "reward_std": 0.9134269952774048, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4699999988079071, "step": 3831 }, { "completion_length": 121.78125, "epoch": 2.050294275013376, "grad_norm": 0.9959471821784973, "kl": 0.1923200488090515, "learning_rate": 1.3773839064963956e-06, "loss": 0.0077, "reward": 2.5854063034057617, "reward_std": 0.8237021565437317, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.47603124380111694, "step": 3832 }, { "completion_length": 119.90625, "epoch": 2.050829320492242, "grad_norm": 2.2777438163757324, "kl": 0.22442038357257843, "learning_rate": 1.3759933990861857e-06, "loss": 0.009, "reward": 2.0158751010894775, "reward_std": 0.7858301997184753, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.468999981880188, "step": 3833 }, { "completion_length": 143.875, "epoch": 2.0513643659711076, "grad_norm": 1.050652265548706, "kl": 0.19025728106498718, "learning_rate": 1.3746033273624742e-06, "loss": 0.0076, "reward": 1.8981561660766602, "reward_std": 1.0742287635803223, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4137812554836273, "step": 3834 }, { "completion_length": 145.40625, "epoch": 2.0518994114499733, "grad_norm": 0.8318642377853394, "kl": 0.15556004643440247, "learning_rate": 1.3732136918640787e-06, "loss": 0.0062, "reward": 2.5205001831054688, "reward_std": 0.8627471923828125, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4423750042915344, "step": 3835 }, { "completion_length": 130.34375, "epoch": 2.052434456928839, "grad_norm": 2.670544147491455, "kl": 0.26810067892074585, "learning_rate": 1.371824493129648e-06, "loss": 0.0107, "reward": 1.9793437719345093, "reward_std": 0.8860170245170593, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4480937421321869, "step": 3836 }, { "completion_length": 122.3125, "epoch": 2.052969502407705, "grad_norm": 0.5823332667350769, "kl": 0.19594672322273254, "learning_rate": 1.3704357316976625e-06, "loss": 0.0078, "reward": 2.70703125, "reward_std": 0.6778561472892761, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 3837 }, { "completion_length": 115.625, "epoch": 2.0535045478865706, "grad_norm": 0.7049341797828674, "kl": 0.17328619956970215, "learning_rate": 1.369047408106433e-06, "loss": 0.0069, "reward": 2.634718894958496, "reward_std": 0.5693713426589966, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.49409374594688416, "step": 3838 }, { "completion_length": 136.3125, "epoch": 2.054039593365436, "grad_norm": 4304.54248046875, "kl": 23.710546493530273, "learning_rate": 1.3676595228941004e-06, "loss": 0.9484, "reward": 2.0206875801086426, "reward_std": 1.163344383239746, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4425625205039978, "step": 3839 }, { "completion_length": 103.6875, "epoch": 2.0545746388443016, "grad_norm": 1.1249582767486572, "kl": 0.30438753962516785, "learning_rate": 1.3662720765986343e-06, "loss": 0.0122, "reward": 1.8970624208450317, "reward_std": 0.7581169605255127, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4751874804496765, "step": 3840 }, { "completion_length": 148.6875, "epoch": 2.0551096843231673, "grad_norm": 1.7441277503967285, "kl": 0.2169194370508194, "learning_rate": 1.364885069757838e-06, "loss": 0.0087, "reward": 1.98828125, "reward_std": 0.7724401950836182, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45703125, "step": 3841 }, { "completion_length": 137.75, "epoch": 2.055644729802033, "grad_norm": 0.9306711554527283, "kl": 0.16209623217582703, "learning_rate": 1.363498502909339e-06, "loss": 0.0065, "reward": 2.51171875, "reward_std": 0.9691517949104309, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46484375, "step": 3842 }, { "completion_length": 123.3125, "epoch": 2.056179775280899, "grad_norm": 0.757688045501709, "kl": 0.15394234657287598, "learning_rate": 1.3621123765905986e-06, "loss": 0.0062, "reward": 2.392343759536743, "reward_std": 0.7250649929046631, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47046875953674316, "step": 3843 }, { "completion_length": 135.1875, "epoch": 2.0567148207597645, "grad_norm": 1.2468091249465942, "kl": 0.1814194619655609, "learning_rate": 1.3607266913389077e-06, "loss": 0.0073, "reward": 1.390625, "reward_std": 0.21778544783592224, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 3844 }, { "completion_length": 120.90625, "epoch": 2.0572498662386303, "grad_norm": 730190.4375, "kl": 315.21673583984375, "learning_rate": 1.3593414476913809e-06, "loss": 12.6087, "reward": 2.0209689140319824, "reward_std": 0.9681253433227539, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4740937352180481, "step": 3845 }, { "completion_length": 132.28125, "epoch": 2.057784911717496, "grad_norm": 1.3037426471710205, "kl": 0.1952209323644638, "learning_rate": 1.357956646184968e-06, "loss": 0.0078, "reward": 1.5111875534057617, "reward_std": 0.4993595778942108, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49556249380111694, "step": 3846 }, { "completion_length": 142.53125, "epoch": 2.0583199571963617, "grad_norm": 0.5256340503692627, "kl": 0.13199810683727264, "learning_rate": 1.3565722873564433e-06, "loss": 0.0053, "reward": 2.812406301498413, "reward_std": 0.48287564516067505, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4842812418937683, "step": 3847 }, { "completion_length": 132.59375, "epoch": 2.0588550026752275, "grad_norm": 0.6960042715072632, "kl": 0.15033908188343048, "learning_rate": 1.3551883717424102e-06, "loss": 0.006, "reward": 2.71484375, "reward_std": 0.7168547511100769, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48046875, "step": 3848 }, { "completion_length": 121.6875, "epoch": 2.059390048154093, "grad_norm": 1.7419421672821045, "kl": 0.1999111771583557, "learning_rate": 1.353804899879303e-06, "loss": 0.008, "reward": 2.2067813873291016, "reward_std": 0.981584906578064, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.472406268119812, "step": 3849 }, { "completion_length": 139.25, "epoch": 2.059925093632959, "grad_norm": 0.43019479513168335, "kl": 0.13413169980049133, "learning_rate": 1.3524218723033804e-06, "loss": 0.0054, "reward": 2.60546875, "reward_std": 0.7029262185096741, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46484375, "step": 3850 }, { "completion_length": 130.75, "epoch": 2.0604601391118247, "grad_norm": 0.4841633141040802, "kl": 0.12352112680673599, "learning_rate": 1.3510392895507298e-06, "loss": 0.0049, "reward": 2.1875, "reward_std": 0.8437830209732056, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 3851 }, { "completion_length": 114.53125, "epoch": 2.0609951845906904, "grad_norm": 0.7775639891624451, "kl": 0.2059798687696457, "learning_rate": 1.3496571521572688e-06, "loss": 0.0082, "reward": 2.7534375190734863, "reward_std": 0.7988368272781372, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42531251907348633, "step": 3852 }, { "completion_length": 127.5625, "epoch": 2.0615302300695557, "grad_norm": 2.8297979831695557, "kl": 0.1784294992685318, "learning_rate": 1.348275460658739e-06, "loss": 0.0071, "reward": 2.1010000705718994, "reward_std": 0.8916584253311157, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47600001096725464, "step": 3853 }, { "completion_length": 130.875, "epoch": 2.0620652755484215, "grad_norm": 2.608320474624634, "kl": 0.4724910855293274, "learning_rate": 1.3468942155907109e-06, "loss": 0.0189, "reward": 2.2612500190734863, "reward_std": 1.165095567703247, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46437498927116394, "step": 3854 }, { "completion_length": 126.34375, "epoch": 2.062600321027287, "grad_norm": 11.534663200378418, "kl": 0.2546752393245697, "learning_rate": 1.3455134174885804e-06, "loss": 0.0102, "reward": 2.5378124713897705, "reward_std": 0.8893288373947144, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4596875309944153, "step": 3855 }, { "completion_length": 150.09375, "epoch": 2.063135366506153, "grad_norm": 0.5920522809028625, "kl": 0.14232483506202698, "learning_rate": 1.3441330668875729e-06, "loss": 0.0057, "reward": 1.555187463760376, "reward_std": 0.4536007046699524, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.36768749356269836, "step": 3856 }, { "completion_length": 144.59375, "epoch": 2.0636704119850187, "grad_norm": 0.832812488079071, "kl": 0.22697974741458893, "learning_rate": 1.3427531643227382e-06, "loss": 0.0091, "reward": 1.918375015258789, "reward_std": 0.9953397512435913, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4496249854564667, "step": 3857 }, { "completion_length": 155.03125, "epoch": 2.0642054574638844, "grad_norm": 0.6540548801422119, "kl": 0.15786999464035034, "learning_rate": 1.341373710328952e-06, "loss": 0.0063, "reward": 2.220156192779541, "reward_std": 1.0605345964431763, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4232812523841858, "step": 3858 }, { "completion_length": 125.0625, "epoch": 2.06474050294275, "grad_norm": 0.4867629110813141, "kl": 0.14916633069515228, "learning_rate": 1.339994705440919e-06, "loss": 0.006, "reward": 2.224062442779541, "reward_std": 0.480690062046051, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4271875023841858, "step": 3859 }, { "completion_length": 127.65625, "epoch": 2.065275548421616, "grad_norm": 1.0055574178695679, "kl": 0.21985957026481628, "learning_rate": 1.3386161501931668e-06, "loss": 0.0088, "reward": 2.398250102996826, "reward_std": 0.9242821335792542, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.460750013589859, "step": 3860 }, { "completion_length": 141.375, "epoch": 2.0658105939004816, "grad_norm": 0.967059314250946, "kl": 0.1981385350227356, "learning_rate": 1.337238045120049e-06, "loss": 0.0079, "reward": 1.745187520980835, "reward_std": 0.5115034580230713, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.44831252098083496, "step": 3861 }, { "completion_length": 118.6875, "epoch": 2.0663456393793473, "grad_norm": 2.156428337097168, "kl": 0.15423911809921265, "learning_rate": 1.3358603907557487e-06, "loss": 0.0062, "reward": 2.5264687538146973, "reward_std": 0.9945435523986816, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46396875381469727, "step": 3862 }, { "completion_length": 144.15625, "epoch": 2.066880684858213, "grad_norm": 1.3154875040054321, "kl": 0.1377258151769638, "learning_rate": 1.3344831876342667e-06, "loss": 0.0055, "reward": 1.624000072479248, "reward_std": 0.5090678930282593, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.37400001287460327, "step": 3863 }, { "completion_length": 151.71875, "epoch": 2.067415730337079, "grad_norm": 0.8858510851860046, "kl": 0.14820513129234314, "learning_rate": 1.3331064362894363e-06, "loss": 0.0059, "reward": 2.222781181335449, "reward_std": 1.2611662149429321, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.457156240940094, "step": 3864 }, { "completion_length": 131.5625, "epoch": 2.0679507758159446, "grad_norm": 0.5626877546310425, "kl": 0.15502823889255524, "learning_rate": 1.3317301372549125e-06, "loss": 0.0062, "reward": 2.6325626373291016, "reward_std": 0.7992572784423828, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.476312518119812, "step": 3865 }, { "completion_length": 121.90625, "epoch": 2.06848582129481, "grad_norm": 10.01205062866211, "kl": 0.7437334656715393, "learning_rate": 1.3303542910641732e-06, "loss": 0.0297, "reward": 2.5078125, "reward_std": 0.7810142040252686, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4921875, "step": 3866 }, { "completion_length": 118.28125, "epoch": 2.0690208667736756, "grad_norm": 1.4844735860824585, "kl": 0.17190545797348022, "learning_rate": 1.328978898250525e-06, "loss": 0.0069, "reward": 2.1507186889648438, "reward_std": 0.8807872533798218, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4632187485694885, "step": 3867 }, { "completion_length": 137.5625, "epoch": 2.0695559122525413, "grad_norm": 1.2101387977600098, "kl": 0.1642310619354248, "learning_rate": 1.3276039593470957e-06, "loss": 0.0066, "reward": 1.7793124914169312, "reward_std": 0.5691065788269043, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43556249141693115, "step": 3868 }, { "completion_length": 118.28125, "epoch": 2.070090957731407, "grad_norm": 1.954724907875061, "kl": 0.1499420404434204, "learning_rate": 1.326229474886838e-06, "loss": 0.006, "reward": 2.75, "reward_std": 0.9804592132568359, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 3869 }, { "completion_length": 129.09375, "epoch": 2.070626003210273, "grad_norm": 0.7702617645263672, "kl": 0.16375666856765747, "learning_rate": 1.3248554454025275e-06, "loss": 0.0066, "reward": 2.185187578201294, "reward_std": 0.6049363613128662, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48206251859664917, "step": 3870 }, { "completion_length": 125.46875, "epoch": 2.0711610486891385, "grad_norm": 1.2241464853286743, "kl": 0.20263753831386566, "learning_rate": 1.323481871426766e-06, "loss": 0.0081, "reward": 2.3646249771118164, "reward_std": 0.6786141395568848, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4115000069141388, "step": 3871 }, { "completion_length": 119.25, "epoch": 2.0716960941680043, "grad_norm": 1.049838662147522, "kl": 0.15450718998908997, "learning_rate": 1.322108753491977e-06, "loss": 0.0062, "reward": 1.9765625, "reward_std": 0.6636115312576294, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4921875, "step": 3872 }, { "completion_length": 139.6875, "epoch": 2.07223113964687, "grad_norm": 0.7493727207183838, "kl": 0.1456281691789627, "learning_rate": 1.3207360921304047e-06, "loss": 0.0058, "reward": 2.339343786239624, "reward_std": 0.9288794994354248, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43309372663497925, "step": 3873 }, { "completion_length": 125.59375, "epoch": 2.0727661851257357, "grad_norm": 1.3397799730300903, "kl": 0.4071069061756134, "learning_rate": 1.3193638878741221e-06, "loss": 0.0163, "reward": 2.6108436584472656, "reward_std": 0.5621076822280884, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4702187478542328, "step": 3874 }, { "completion_length": 114.5625, "epoch": 2.0733012306046015, "grad_norm": 1.003615379333496, "kl": 0.22216816246509552, "learning_rate": 1.317992141255021e-06, "loss": 0.0089, "reward": 2.70284366607666, "reward_std": 0.602840781211853, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4840937554836273, "step": 3875 }, { "completion_length": 122.9375, "epoch": 2.073836276083467, "grad_norm": 1.132096290588379, "kl": 0.19793498516082764, "learning_rate": 1.316620852804815e-06, "loss": 0.0079, "reward": 2.7187187671661377, "reward_std": 0.6408309936523438, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4687187671661377, "step": 3876 }, { "completion_length": 120.4375, "epoch": 2.074371321562333, "grad_norm": 0.8760308623313904, "kl": 0.2029608190059662, "learning_rate": 1.315250023055044e-06, "loss": 0.0081, "reward": 2.2084686756134033, "reward_std": 0.8787249326705933, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4584687352180481, "step": 3877 }, { "completion_length": 114.09375, "epoch": 2.0749063670411987, "grad_norm": 1.2555068731307983, "kl": 0.2196616232395172, "learning_rate": 1.3138796525370673e-06, "loss": 0.0088, "reward": 1.950124979019165, "reward_std": 0.820253849029541, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4813750088214874, "step": 3878 }, { "completion_length": 124.71875, "epoch": 2.075441412520064, "grad_norm": 2.504409074783325, "kl": 0.2069694697856903, "learning_rate": 1.3125097417820662e-06, "loss": 0.0083, "reward": 1.9906874895095825, "reward_std": 0.8560133576393127, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4594375193119049, "step": 3879 }, { "completion_length": 127.71875, "epoch": 2.0759764579989297, "grad_norm": 1.6306744813919067, "kl": 0.25619077682495117, "learning_rate": 1.3111402913210437e-06, "loss": 0.0102, "reward": 2.345062494277954, "reward_std": 0.9467331767082214, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4075624942779541, "step": 3880 }, { "completion_length": 126.78125, "epoch": 2.0765115034777955, "grad_norm": 0.8608573079109192, "kl": 0.17338213324546814, "learning_rate": 1.309771301684827e-06, "loss": 0.0069, "reward": 2.1545000076293945, "reward_std": 0.8065694570541382, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42012500762939453, "step": 3881 }, { "completion_length": 104.03125, "epoch": 2.077046548956661, "grad_norm": 1.0505080223083496, "kl": 0.25028711557388306, "learning_rate": 1.3084027734040614e-06, "loss": 0.01, "reward": 2.93331241607666, "reward_std": 0.15809613466262817, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4958125054836273, "step": 3882 }, { "completion_length": 119.78125, "epoch": 2.077581594435527, "grad_norm": 0.8214157819747925, "kl": 0.20449136197566986, "learning_rate": 1.307034707009214e-06, "loss": 0.0082, "reward": 2.520625114440918, "reward_std": 0.9463884830474854, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4581249952316284, "step": 3883 }, { "completion_length": 123.09375, "epoch": 2.0781166399143927, "grad_norm": 1.44376802444458, "kl": 0.36780375242233276, "learning_rate": 1.3056671030305756e-06, "loss": 0.0147, "reward": 1.57421875, "reward_std": 0.3118517994880676, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49609375, "step": 3884 }, { "completion_length": 133.84375, "epoch": 2.0786516853932584, "grad_norm": 1.1350849866867065, "kl": 0.26998692750930786, "learning_rate": 1.3042999619982547e-06, "loss": 0.0108, "reward": 1.854062557220459, "reward_std": 0.9534916877746582, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4790624976158142, "step": 3885 }, { "completion_length": 125.09375, "epoch": 2.079186730872124, "grad_norm": 3.5714786052703857, "kl": 0.22984063625335693, "learning_rate": 1.3029332844421803e-06, "loss": 0.0092, "reward": 1.9042812585830688, "reward_std": 0.8863129615783691, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46678125858306885, "step": 3886 }, { "completion_length": 143.25, "epoch": 2.07972177635099, "grad_norm": 0.5937566161155701, "kl": 0.2060338407754898, "learning_rate": 1.3015670708921054e-06, "loss": 0.0082, "reward": 1.8504999876022339, "reward_std": 0.33342719078063965, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4598750174045563, "step": 3887 }, { "completion_length": 118.625, "epoch": 2.0802568218298556, "grad_norm": 1.0886896848678589, "kl": 0.19591635465621948, "learning_rate": 1.3002013218775972e-06, "loss": 0.0078, "reward": 1.990593671798706, "reward_std": 0.75566166639328, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49059373140335083, "step": 3888 }, { "completion_length": 147.9375, "epoch": 2.0807918673087213, "grad_norm": 20.298213958740234, "kl": 0.804950475692749, "learning_rate": 1.298836037928048e-06, "loss": 0.0322, "reward": 1.729812502861023, "reward_std": 0.9597644805908203, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.43293750286102295, "step": 3889 }, { "completion_length": 119.78125, "epoch": 2.081326912787587, "grad_norm": 0.658337414264679, "kl": 0.25489747524261475, "learning_rate": 1.2974712195726696e-06, "loss": 0.0102, "reward": 2.463156223297119, "reward_std": 0.7345324754714966, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46315622329711914, "step": 3890 }, { "completion_length": 132.6875, "epoch": 2.081861958266453, "grad_norm": 2.3213350772857666, "kl": 0.3758712708950043, "learning_rate": 1.2961068673404886e-06, "loss": 0.015, "reward": 1.994937539100647, "reward_std": 0.8769804239273071, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4793125092983246, "step": 3891 }, { "completion_length": 118.4375, "epoch": 2.0823970037453186, "grad_norm": 2.071303129196167, "kl": 0.19750459492206573, "learning_rate": 1.2947429817603563e-06, "loss": 0.0079, "reward": 2.555687427520752, "reward_std": 0.9111536741256714, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47756248712539673, "step": 3892 }, { "completion_length": 134.4375, "epoch": 2.082932049224184, "grad_norm": 1.6581238508224487, "kl": 0.19542279839515686, "learning_rate": 1.2933795633609403e-06, "loss": 0.0078, "reward": 2.16796875, "reward_std": 0.921342670917511, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46484375, "step": 3893 }, { "completion_length": 129.84375, "epoch": 2.0834670947030496, "grad_norm": 0.8340645432472229, "kl": 0.18799051642417908, "learning_rate": 1.2920166126707262e-06, "loss": 0.0075, "reward": 2.1354689598083496, "reward_std": 0.5936353206634521, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4479687511920929, "step": 3894 }, { "completion_length": 143.5625, "epoch": 2.0840021401819153, "grad_norm": 0.7927969098091125, "kl": 0.20309367775917053, "learning_rate": 1.2906541302180223e-06, "loss": 0.0081, "reward": 1.8572187423706055, "reward_std": 0.959377646446228, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45096874237060547, "step": 3895 }, { "completion_length": 135.125, "epoch": 2.084537185660781, "grad_norm": 0.7945355772972107, "kl": 0.19928252696990967, "learning_rate": 1.2892921165309519e-06, "loss": 0.008, "reward": 2.2645938396453857, "reward_std": 0.4891068637371063, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4520937502384186, "step": 3896 }, { "completion_length": 137.59375, "epoch": 2.085072231139647, "grad_norm": 0.6877912878990173, "kl": 0.17070047557353973, "learning_rate": 1.287930572137457e-06, "loss": 0.0068, "reward": 2.5579686164855957, "reward_std": 1.0346934795379639, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47984373569488525, "step": 3897 }, { "completion_length": 147.25, "epoch": 2.0856072766185125, "grad_norm": 5.328505992889404, "kl": 0.2744205892086029, "learning_rate": 1.2865694975652982e-06, "loss": 0.011, "reward": 1.6265000104904175, "reward_std": 1.0357552766799927, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4077500104904175, "step": 3898 }, { "completion_length": 104.46875, "epoch": 2.0861423220973783, "grad_norm": 1.4263081550598145, "kl": 0.17172729969024658, "learning_rate": 1.2852088933420556e-06, "loss": 0.0069, "reward": 1.890625, "reward_std": 0.6280937194824219, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 3899 }, { "completion_length": 142.875, "epoch": 2.086677367576244, "grad_norm": 1.7972373962402344, "kl": 0.20436087250709534, "learning_rate": 1.2838487599951244e-06, "loss": 0.0082, "reward": 1.6944375038146973, "reward_std": 0.512276291847229, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41318750381469727, "step": 3900 }, { "completion_length": 138.6875, "epoch": 2.0872124130551097, "grad_norm": 1.5016682147979736, "kl": 0.1426742672920227, "learning_rate": 1.2824890980517173e-06, "loss": 0.0057, "reward": 1.757656216621399, "reward_std": 0.7120281457901001, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4295312464237213, "step": 3901 }, { "completion_length": 145.28125, "epoch": 2.0877474585339755, "grad_norm": 0.6929070949554443, "kl": 0.12378007173538208, "learning_rate": 1.2811299080388678e-06, "loss": 0.005, "reward": 1.8828125, "reward_std": 1.027902603149414, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4453125, "step": 3902 }, { "completion_length": 129.90625, "epoch": 2.088282504012841, "grad_norm": 1.393931269645691, "kl": 0.22931134700775146, "learning_rate": 1.2797711904834226e-06, "loss": 0.0092, "reward": 1.9619686603546143, "reward_std": 1.0761313438415527, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4307187795639038, "step": 3903 }, { "completion_length": 133.0, "epoch": 2.088817549491707, "grad_norm": 24.67824363708496, "kl": 0.3347102105617523, "learning_rate": 1.2784129459120459e-06, "loss": 0.0134, "reward": 1.5648751258850098, "reward_std": 0.913034200668335, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4242499768733978, "step": 3904 }, { "completion_length": 134.0, "epoch": 2.0893525949705727, "grad_norm": 1.2080754041671753, "kl": 0.24489475786685944, "learning_rate": 1.2770551748512211e-06, "loss": 0.0098, "reward": 2.254812479019165, "reward_std": 0.9585667848587036, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47356247901916504, "step": 3905 }, { "completion_length": 148.59375, "epoch": 2.0898876404494384, "grad_norm": 1.4841365814208984, "kl": 0.1766119748353958, "learning_rate": 1.275697877827245e-06, "loss": 0.0071, "reward": 1.9874374866485596, "reward_std": 0.720641553401947, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45618748664855957, "step": 3906 }, { "completion_length": 130.375, "epoch": 2.0904226859283037, "grad_norm": 0.7916320562362671, "kl": 0.17794910073280334, "learning_rate": 1.2743410553662327e-06, "loss": 0.0071, "reward": 2.3114376068115234, "reward_std": 0.8502887487411499, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4520625174045563, "step": 3907 }, { "completion_length": 116.8125, "epoch": 2.0909577314071695, "grad_norm": 1.2095593214035034, "kl": 0.19313830137252808, "learning_rate": 1.2729847079941126e-06, "loss": 0.0077, "reward": 2.3965625762939453, "reward_std": 0.8539366722106934, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47468751668930054, "step": 3908 }, { "completion_length": 140.3125, "epoch": 2.091492776886035, "grad_norm": 1.0764607191085815, "kl": 0.15184421837329865, "learning_rate": 1.2716288362366341e-06, "loss": 0.0061, "reward": 1.8710312843322754, "reward_std": 0.43297243118286133, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.464781254529953, "step": 3909 }, { "completion_length": 114.28125, "epoch": 2.092027822364901, "grad_norm": 0.7807376384735107, "kl": 0.21744340658187866, "learning_rate": 1.2702734406193574e-06, "loss": 0.0087, "reward": 2.3159375190734863, "reward_std": 0.9430729150772095, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45656251907348633, "step": 3910 }, { "completion_length": 135.6875, "epoch": 2.0925628678437667, "grad_norm": 4.498841762542725, "kl": 0.16723358631134033, "learning_rate": 1.2689185216676587e-06, "loss": 0.0067, "reward": 2.3671875, "reward_std": 0.779890239238739, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 3911 }, { "completion_length": 146.21875, "epoch": 2.0930979133226324, "grad_norm": 1.055546760559082, "kl": 0.13922163844108582, "learning_rate": 1.2675640799067337e-06, "loss": 0.0056, "reward": 2.000906229019165, "reward_std": 1.0664138793945312, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4540312588214874, "step": 3912 }, { "completion_length": 142.125, "epoch": 2.093632958801498, "grad_norm": 1.1764689683914185, "kl": 0.16459408402442932, "learning_rate": 1.266210115861586e-06, "loss": 0.0066, "reward": 2.3359375, "reward_std": 0.8614879846572876, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 3913 }, { "completion_length": 135.4375, "epoch": 2.094168004280364, "grad_norm": 0.8483695983886719, "kl": 0.17606285214424133, "learning_rate": 1.2648566300570402e-06, "loss": 0.007, "reward": 2.2470312118530273, "reward_std": 0.75971519947052, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46578124165534973, "step": 3914 }, { "completion_length": 133.71875, "epoch": 2.0947030497592296, "grad_norm": 1.1575264930725098, "kl": 0.2016139030456543, "learning_rate": 1.263503623017735e-06, "loss": 0.0081, "reward": 1.3985313177108765, "reward_std": 0.47200125455856323, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4610312581062317, "step": 3915 }, { "completion_length": 117.25, "epoch": 2.0952380952380953, "grad_norm": 0.766748309135437, "kl": 0.26375967264175415, "learning_rate": 1.2621510952681182e-06, "loss": 0.0106, "reward": 2.3125, "reward_std": 0.7223666906356812, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.5, "step": 3916 }, { "completion_length": 132.59375, "epoch": 2.095773140716961, "grad_norm": 0.7152865529060364, "kl": 0.18235114216804504, "learning_rate": 1.2607990473324582e-06, "loss": 0.0073, "reward": 1.8475000858306885, "reward_std": 0.8600932359695435, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4724999964237213, "step": 3917 }, { "completion_length": 102.90625, "epoch": 2.096308186195827, "grad_norm": 1.8444273471832275, "kl": 0.25299501419067383, "learning_rate": 1.2594474797348338e-06, "loss": 0.0101, "reward": 3.035968780517578, "reward_std": 0.7054481506347656, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48909375071525574, "step": 3918 }, { "completion_length": 115.5, "epoch": 2.0968432316746926, "grad_norm": 2.4203481674194336, "kl": 0.23180465400218964, "learning_rate": 1.2580963929991375e-06, "loss": 0.0093, "reward": 2.724249839782715, "reward_std": 0.9534153342247009, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4586249887943268, "step": 3919 }, { "completion_length": 136.1875, "epoch": 2.097378277153558, "grad_norm": 0.8852016925811768, "kl": 0.15386837720870972, "learning_rate": 1.2567457876490785e-06, "loss": 0.0062, "reward": 1.9061249494552612, "reward_std": 0.919306755065918, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.437375009059906, "step": 3920 }, { "completion_length": 158.6875, "epoch": 2.0979133226324236, "grad_norm": 0.655566394329071, "kl": 0.16029182076454163, "learning_rate": 1.255395664208176e-06, "loss": 0.0064, "reward": 1.261875033378601, "reward_std": 0.5561273694038391, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4493750035762787, "step": 3921 }, { "completion_length": 123.625, "epoch": 2.0984483681112893, "grad_norm": 1.0525795221328735, "kl": 0.18665407598018646, "learning_rate": 1.2540460231997631e-06, "loss": 0.0075, "reward": 1.8635001182556152, "reward_std": 0.31025540828704834, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4884999990463257, "step": 3922 }, { "completion_length": 155.71875, "epoch": 2.098983413590155, "grad_norm": 215.7018280029297, "kl": 3.8868746757507324, "learning_rate": 1.2526968651469884e-06, "loss": 0.1555, "reward": 1.0963749885559082, "reward_std": 0.5086325407028198, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4088749885559082, "step": 3923 }, { "completion_length": 134.15625, "epoch": 2.099518459069021, "grad_norm": 1.0827114582061768, "kl": 0.19342780113220215, "learning_rate": 1.25134819057281e-06, "loss": 0.0077, "reward": 2.4498751163482666, "reward_std": 0.7848701477050781, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44987499713897705, "step": 3924 }, { "completion_length": 123.375, "epoch": 2.1000535045478865, "grad_norm": 1.1838265657424927, "kl": 0.3063839077949524, "learning_rate": 1.2500000000000007e-06, "loss": 0.0123, "reward": 2.291781187057495, "reward_std": 0.9635066390037537, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4792812466621399, "step": 3925 }, { "completion_length": 146.84375, "epoch": 2.1005885500267523, "grad_norm": 1.8723771572113037, "kl": 0.1882225126028061, "learning_rate": 1.2486522939511433e-06, "loss": 0.0075, "reward": 1.6328437328338623, "reward_std": 0.5757766366004944, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4140937626361847, "step": 3926 }, { "completion_length": 140.75, "epoch": 2.101123595505618, "grad_norm": 1.074163794517517, "kl": 0.19034238159656525, "learning_rate": 1.2473050729486372e-06, "loss": 0.0076, "reward": 1.8759686946868896, "reward_std": 1.1243641376495361, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4540937542915344, "step": 3927 }, { "completion_length": 136.53125, "epoch": 2.1016586409844837, "grad_norm": 1.4626160860061646, "kl": 0.19328638911247253, "learning_rate": 1.2459583375146898e-06, "loss": 0.0077, "reward": 1.7999062538146973, "reward_std": 0.5695862174034119, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47178125381469727, "step": 3928 }, { "completion_length": 142.75, "epoch": 2.1021936864633495, "grad_norm": 0.8950883150100708, "kl": 0.14545273780822754, "learning_rate": 1.2446120881713204e-06, "loss": 0.0058, "reward": 2.0361249446868896, "reward_std": 0.8119289875030518, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4423750042915344, "step": 3929 }, { "completion_length": 108.09375, "epoch": 2.102728731942215, "grad_norm": 1.1606701612472534, "kl": 0.21449396014213562, "learning_rate": 1.2432663254403637e-06, "loss": 0.0086, "reward": 2.98075008392334, "reward_std": 0.9679579734802246, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4807499945163727, "step": 3930 }, { "completion_length": 128.96875, "epoch": 2.103263777421081, "grad_norm": 6.990646839141846, "kl": 0.31897205114364624, "learning_rate": 1.24192104984346e-06, "loss": 0.0128, "reward": 1.8074063062667847, "reward_std": 0.8240072727203369, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4792812764644623, "step": 3931 }, { "completion_length": 154.15625, "epoch": 2.1037988228999467, "grad_norm": 2.2473514080047607, "kl": 0.1542176753282547, "learning_rate": 1.2405762619020655e-06, "loss": 0.0062, "reward": 1.7409999370574951, "reward_std": 0.48502883315086365, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.3816249966621399, "step": 3932 }, { "completion_length": 143.96875, "epoch": 2.104333868378812, "grad_norm": 1.043958306312561, "kl": 0.18027573823928833, "learning_rate": 1.2392319621374476e-06, "loss": 0.0072, "reward": 2.0799062252044678, "reward_std": 0.544491708278656, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47053125500679016, "step": 3933 }, { "completion_length": 124.53125, "epoch": 2.1048689138576777, "grad_norm": 1.9236652851104736, "kl": 0.15853223204612732, "learning_rate": 1.237888151070679e-06, "loss": 0.0063, "reward": 2.6192188262939453, "reward_std": 0.4602813422679901, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47859376668930054, "step": 3934 }, { "completion_length": 111.65625, "epoch": 2.1054039593365435, "grad_norm": 1.4221522808074951, "kl": 0.3876894414424896, "learning_rate": 1.236544829222649e-06, "loss": 0.0155, "reward": 2.37918758392334, "reward_std": 1.0545594692230225, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4729374647140503, "step": 3935 }, { "completion_length": 142.46875, "epoch": 2.105939004815409, "grad_norm": 2.1979966163635254, "kl": 0.18528257310390472, "learning_rate": 1.2352019971140545e-06, "loss": 0.0074, "reward": 1.241781234741211, "reward_std": 0.577716588973999, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4449062645435333, "step": 3936 }, { "completion_length": 129.28125, "epoch": 2.106474050294275, "grad_norm": 1.5438941717147827, "kl": 0.2252657115459442, "learning_rate": 1.2338596552654017e-06, "loss": 0.009, "reward": 1.9614686965942383, "reward_std": 0.7778937220573425, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47709375619888306, "step": 3937 }, { "completion_length": 134.0, "epoch": 2.1070090957731407, "grad_norm": 1.4676008224487305, "kl": 0.16660073399543762, "learning_rate": 1.2325178041970102e-06, "loss": 0.0067, "reward": 1.5428438186645508, "reward_std": 0.6154108643531799, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.433468759059906, "step": 3938 }, { "completion_length": 119.84375, "epoch": 2.1075441412520064, "grad_norm": 272.7956237792969, "kl": 2.624404191970825, "learning_rate": 1.2311764444290062e-06, "loss": 0.105, "reward": 1.75, "reward_std": 0.6101624965667725, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 3939 }, { "completion_length": 134.15625, "epoch": 2.108079186730872, "grad_norm": 1.8546013832092285, "kl": 0.35514453053474426, "learning_rate": 1.229835576481326e-06, "loss": 0.0142, "reward": 2.0560312271118164, "reward_std": 0.9011075496673584, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4154062569141388, "step": 3940 }, { "completion_length": 147.875, "epoch": 2.108614232209738, "grad_norm": 0.4865957200527191, "kl": 0.1895194798707962, "learning_rate": 1.2284952008737153e-06, "loss": 0.0076, "reward": 1.1704062223434448, "reward_std": 0.5658398270606995, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4047812521457672, "step": 3941 }, { "completion_length": 143.375, "epoch": 2.1091492776886036, "grad_norm": 1.2205501794815063, "kl": 0.20198829472064972, "learning_rate": 1.2271553181257306e-06, "loss": 0.0081, "reward": 1.96484375, "reward_std": 0.9603904485702515, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44921875, "step": 3942 }, { "completion_length": 151.84375, "epoch": 2.1096843231674693, "grad_norm": 2.49967885017395, "kl": 0.36493146419525146, "learning_rate": 1.2258159287567362e-06, "loss": 0.0146, "reward": 1.429593801498413, "reward_std": 0.8794623017311096, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4139687418937683, "step": 3943 }, { "completion_length": 123.9375, "epoch": 2.110219368646335, "grad_norm": 0.76216059923172, "kl": 0.19707761704921722, "learning_rate": 1.2244770332859032e-06, "loss": 0.0079, "reward": 2.5509376525878906, "reward_std": 0.6128417253494263, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4884375035762787, "step": 3944 }, { "completion_length": 147.125, "epoch": 2.110754414125201, "grad_norm": 0.588129460811615, "kl": 0.14020682871341705, "learning_rate": 1.2231386322322155e-06, "loss": 0.0056, "reward": 1.6567187309265137, "reward_std": 0.5665715336799622, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43796873092651367, "step": 3945 }, { "completion_length": 138.28125, "epoch": 2.1112894596040666, "grad_norm": 2.589073419570923, "kl": 0.19608430564403534, "learning_rate": 1.221800726114462e-06, "loss": 0.0078, "reward": 2.208218812942505, "reward_std": 1.174086093902588, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4425937533378601, "step": 3946 }, { "completion_length": 120.65625, "epoch": 2.111824505082932, "grad_norm": 0.8884743452072144, "kl": 0.21525484323501587, "learning_rate": 1.2204633154512397e-06, "loss": 0.0086, "reward": 1.8160312175750732, "reward_std": 0.7282400727272034, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48790624737739563, "step": 3947 }, { "completion_length": 124.9375, "epoch": 2.1123595505617976, "grad_norm": 0.8692162036895752, "kl": 0.15843690931797028, "learning_rate": 1.2191264007609566e-06, "loss": 0.0063, "reward": 2.5703125, "reward_std": 0.5958166718482971, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4921875, "step": 3948 }, { "completion_length": 109.90625, "epoch": 2.1128945960406633, "grad_norm": 2.9369373321533203, "kl": 0.31076931953430176, "learning_rate": 1.2177899825618252e-06, "loss": 0.0124, "reward": 2.564812421798706, "reward_std": 1.1198530197143555, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4710625112056732, "step": 3949 }, { "completion_length": 137.6875, "epoch": 2.113429641519529, "grad_norm": 1.0543487071990967, "kl": 0.1765930950641632, "learning_rate": 1.2164540613718676e-06, "loss": 0.0071, "reward": 1.6421250104904175, "reward_std": 0.690067708492279, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4702500104904175, "step": 3950 }, { "completion_length": 151.25, "epoch": 2.113964686998395, "grad_norm": 1.8666222095489502, "kl": 0.4225238859653473, "learning_rate": 1.2151186377089104e-06, "loss": 0.0169, "reward": 1.5068750381469727, "reward_std": 0.4270632266998291, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.38187500834465027, "step": 3951 }, { "completion_length": 132.625, "epoch": 2.1144997324772605, "grad_norm": 1.877424955368042, "kl": 0.1994045376777649, "learning_rate": 1.213783712090592e-06, "loss": 0.008, "reward": 2.2713124752044678, "reward_std": 0.8734596967697144, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4431874752044678, "step": 3952 }, { "completion_length": 135.125, "epoch": 2.1150347779561263, "grad_norm": 1.7502238750457764, "kl": 0.16355086863040924, "learning_rate": 1.2124492850343542e-06, "loss": 0.0065, "reward": 1.6326251029968262, "reward_std": 0.45892441272735596, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4919999837875366, "step": 3953 }, { "completion_length": 101.625, "epoch": 2.115569823434992, "grad_norm": 2.2437796592712402, "kl": 0.28461143374443054, "learning_rate": 1.2111153570574454e-06, "loss": 0.0114, "reward": 2.5413436889648438, "reward_std": 0.5230071544647217, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4944687485694885, "step": 3954 }, { "completion_length": 113.53125, "epoch": 2.1161048689138577, "grad_norm": 0.4985327124595642, "kl": 0.16971391439437866, "learning_rate": 1.2097819286769237e-06, "loss": 0.0068, "reward": 2.71875, "reward_std": 0.33109182119369507, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 3955 }, { "completion_length": 124.84375, "epoch": 2.1166399143927235, "grad_norm": 2.4589593410491943, "kl": 0.46630415320396423, "learning_rate": 1.20844900040965e-06, "loss": 0.0187, "reward": 1.8355937004089355, "reward_std": 0.2852233648300171, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47621873021125793, "step": 3956 }, { "completion_length": 127.375, "epoch": 2.117174959871589, "grad_norm": 1.7056427001953125, "kl": 0.19581608474254608, "learning_rate": 1.2071165727722925e-06, "loss": 0.0078, "reward": 1.710249900817871, "reward_std": 0.493884414434433, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.42899999022483826, "step": 3957 }, { "completion_length": 145.0625, "epoch": 2.117710005350455, "grad_norm": 0.8677793145179749, "kl": 0.16283932328224182, "learning_rate": 1.2057846462813284e-06, "loss": 0.0065, "reward": 1.7734375, "reward_std": 0.7996863126754761, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4765625, "step": 3958 }, { "completion_length": 150.09375, "epoch": 2.1182450508293207, "grad_norm": 0.8491061329841614, "kl": 0.14205466210842133, "learning_rate": 1.2044532214530338e-06, "loss": 0.0057, "reward": 1.920875072479248, "reward_std": 0.8775756359100342, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42087501287460327, "step": 3959 }, { "completion_length": 119.9375, "epoch": 2.1187800963081864, "grad_norm": 2.2960987091064453, "kl": 0.23190338909626007, "learning_rate": 1.203122298803497e-06, "loss": 0.0093, "reward": 2.606468677520752, "reward_std": 0.9057864546775818, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48146873712539673, "step": 3960 }, { "completion_length": 154.25, "epoch": 2.1193151417870517, "grad_norm": 0.8461146354675293, "kl": 0.17254407703876495, "learning_rate": 1.2017918788486102e-06, "loss": 0.0069, "reward": 1.7297186851501465, "reward_std": 0.9836122393608093, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44846874475479126, "step": 3961 }, { "completion_length": 140.375, "epoch": 2.1198501872659175, "grad_norm": 2.457305431365967, "kl": 0.1875084489583969, "learning_rate": 1.2004619621040667e-06, "loss": 0.0075, "reward": 1.8700624704360962, "reward_std": 0.8643339276313782, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4481875002384186, "step": 3962 }, { "completion_length": 120.90625, "epoch": 2.120385232744783, "grad_norm": 1.3538683652877808, "kl": 0.18516115844249725, "learning_rate": 1.19913254908537e-06, "loss": 0.0074, "reward": 2.02734375, "reward_std": 0.7818320989608765, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46484375, "step": 3963 }, { "completion_length": 122.65625, "epoch": 2.120920278223649, "grad_norm": 1.2906877994537354, "kl": 0.22423307597637177, "learning_rate": 1.1978036403078252e-06, "loss": 0.009, "reward": 2.591156244277954, "reward_std": 0.7387478947639465, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4817812442779541, "step": 3964 }, { "completion_length": 116.03125, "epoch": 2.1214553237025147, "grad_norm": 0.7063959836959839, "kl": 0.18020373582839966, "learning_rate": 1.196475236286542e-06, "loss": 0.0072, "reward": 2.437375068664551, "reward_std": 0.8621271848678589, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 3965 }, { "completion_length": 122.15625, "epoch": 2.1219903691813804, "grad_norm": 458.654296875, "kl": 2.5844764709472656, "learning_rate": 1.1951473375364373e-06, "loss": 0.1034, "reward": 2.1875, "reward_std": 0.987643301486969, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 3966 }, { "completion_length": 119.28125, "epoch": 2.122525414660246, "grad_norm": 1.0434216260910034, "kl": 0.23784321546554565, "learning_rate": 1.1938199445722288e-06, "loss": 0.0095, "reward": 2.554562568664551, "reward_std": 0.802132248878479, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 3967 }, { "completion_length": 128.21875, "epoch": 2.123060460139112, "grad_norm": 4.874111175537109, "kl": 0.7879018783569336, "learning_rate": 1.1924930579084396e-06, "loss": 0.0315, "reward": 2.1932499408721924, "reward_std": 0.44453731179237366, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45887500047683716, "step": 3968 }, { "completion_length": 134.0625, "epoch": 2.1235955056179776, "grad_norm": 210607424.0, "kl": 541383.0625, "learning_rate": 1.1911666780593956e-06, "loss": 21655.3262, "reward": 2.576937437057495, "reward_std": 0.6462362408638, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4675624966621399, "step": 3969 }, { "completion_length": 123.90625, "epoch": 2.1241305510968433, "grad_norm": 2.0636789798736572, "kl": 0.2300269901752472, "learning_rate": 1.1898408055392285e-06, "loss": 0.0092, "reward": 1.8368749618530273, "reward_std": 0.6105487942695618, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47749999165534973, "step": 3970 }, { "completion_length": 135.96875, "epoch": 2.124665596575709, "grad_norm": 1.749659538269043, "kl": 0.24319833517074585, "learning_rate": 1.1885154408618715e-06, "loss": 0.0097, "reward": 1.4893124103546143, "reward_std": 1.0133994817733765, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4424374997615814, "step": 3971 }, { "completion_length": 124.90625, "epoch": 2.125200642054575, "grad_norm": 2.013002395629883, "kl": 0.1583133339881897, "learning_rate": 1.1871905845410603e-06, "loss": 0.0063, "reward": 2.4339375495910645, "reward_std": 0.9988167881965637, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4651874899864197, "step": 3972 }, { "completion_length": 125.0625, "epoch": 2.1257356875334406, "grad_norm": 0.8560447692871094, "kl": 0.17675676941871643, "learning_rate": 1.185866237090336e-06, "loss": 0.0071, "reward": 2.765625, "reward_std": 0.48454001545906067, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 3973 }, { "completion_length": 114.1875, "epoch": 2.126270733012306, "grad_norm": 0.8640762567520142, "kl": 0.18686380982398987, "learning_rate": 1.1845423990230406e-06, "loss": 0.0075, "reward": 2.718625068664551, "reward_std": 0.7796874642372131, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 3974 }, { "completion_length": 121.71875, "epoch": 2.1268057784911716, "grad_norm": 1.5660958290100098, "kl": 0.2260764241218567, "learning_rate": 1.1832190708523181e-06, "loss": 0.009, "reward": 2.48828125, "reward_std": 0.9587959051132202, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 3975 }, { "completion_length": 139.90625, "epoch": 2.1273408239700373, "grad_norm": 0.7404438853263855, "kl": 0.18754678964614868, "learning_rate": 1.1818962530911182e-06, "loss": 0.0075, "reward": 2.1425623893737793, "reward_std": 0.8866649866104126, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47068750858306885, "step": 3976 }, { "completion_length": 142.0, "epoch": 2.127875869448903, "grad_norm": 1.3317497968673706, "kl": 0.18531042337417603, "learning_rate": 1.1805739462521872e-06, "loss": 0.0074, "reward": 1.9054999351501465, "reward_std": 0.8392578363418579, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46799999475479126, "step": 3977 }, { "completion_length": 124.03125, "epoch": 2.128410914927769, "grad_norm": 4.392162799835205, "kl": 0.2850051820278168, "learning_rate": 1.1792521508480794e-06, "loss": 0.0114, "reward": 2.5766873359680176, "reward_std": 0.5939918756484985, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4829374849796295, "step": 3978 }, { "completion_length": 117.15625, "epoch": 2.1289459604066345, "grad_norm": 3.4026007652282715, "kl": 0.20133984088897705, "learning_rate": 1.177930867391147e-06, "loss": 0.0081, "reward": 2.1991875171661377, "reward_std": 0.7812237739562988, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4491875171661377, "step": 3979 }, { "completion_length": 135.1875, "epoch": 2.1294810058855003, "grad_norm": 0.9857909679412842, "kl": 0.17675432562828064, "learning_rate": 1.1766100963935434e-06, "loss": 0.0071, "reward": 2.047593593597412, "reward_std": 1.0968496799468994, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46946874260902405, "step": 3980 }, { "completion_length": 114.15625, "epoch": 2.130016051364366, "grad_norm": 0.8013331890106201, "kl": 0.19095921516418457, "learning_rate": 1.1752898383672273e-06, "loss": 0.0076, "reward": 2.110687494277954, "reward_std": 0.8086545467376709, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4700624942779541, "step": 3981 }, { "completion_length": 115.6875, "epoch": 2.1305510968432317, "grad_norm": 0.5943166613578796, "kl": 0.15570537745952606, "learning_rate": 1.1739700938239546e-06, "loss": 0.0062, "reward": 2.64453125, "reward_std": 0.863371729850769, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 3982 }, { "completion_length": 120.5625, "epoch": 2.1310861423220975, "grad_norm": 0.839411735534668, "kl": 0.17018458247184753, "learning_rate": 1.1726508632752836e-06, "loss": 0.0068, "reward": 2.230968713760376, "reward_std": 0.8187887668609619, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48096874356269836, "step": 3983 }, { "completion_length": 140.875, "epoch": 2.131621187800963, "grad_norm": 0.9376932382583618, "kl": 0.162225604057312, "learning_rate": 1.1713321472325727e-06, "loss": 0.0065, "reward": 2.4453125, "reward_std": 0.8942520618438721, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 3984 }, { "completion_length": 117.0, "epoch": 2.132156233279829, "grad_norm": 0.818389356136322, "kl": 0.16803574562072754, "learning_rate": 1.1700139462069835e-06, "loss": 0.0067, "reward": 2.910343647003174, "reward_std": 0.20997408032417297, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.472843736410141, "step": 3985 }, { "completion_length": 126.03125, "epoch": 2.1326912787586947, "grad_norm": 1.5377633571624756, "kl": 0.17460010945796967, "learning_rate": 1.1686962607094743e-06, "loss": 0.007, "reward": 1.6308125257492065, "reward_std": 0.8079559803009033, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45893749594688416, "step": 3986 }, { "completion_length": 119.0, "epoch": 2.13322632423756, "grad_norm": 1.1444035768508911, "kl": 0.25324490666389465, "learning_rate": 1.167379091250805e-06, "loss": 0.0101, "reward": 2.1544063091278076, "reward_std": 0.8349233865737915, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48253124952316284, "step": 3987 }, { "completion_length": 128.96875, "epoch": 2.1337613697164257, "grad_norm": 1.614444375038147, "kl": 0.29740089178085327, "learning_rate": 1.1660624383415373e-06, "loss": 0.0119, "reward": 2.490593671798706, "reward_std": 0.7384319305419922, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4593437612056732, "step": 3988 }, { "completion_length": 134.21875, "epoch": 2.1342964151952915, "grad_norm": 1.958706021308899, "kl": 0.18317796289920807, "learning_rate": 1.1647463024920303e-06, "loss": 0.0073, "reward": 1.8005000352859497, "reward_std": 0.9276899695396423, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4411250054836273, "step": 3989 }, { "completion_length": 109.28125, "epoch": 2.134831460674157, "grad_norm": 0.9527906775474548, "kl": 0.1897239238023758, "learning_rate": 1.1634306842124423e-06, "loss": 0.0076, "reward": 3.03125, "reward_std": 0.4658970534801483, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 3990 }, { "completion_length": 133.28125, "epoch": 2.135366506153023, "grad_norm": 1.0447523593902588, "kl": 0.16345490515232086, "learning_rate": 1.1621155840127343e-06, "loss": 0.0065, "reward": 2.37890625, "reward_std": 0.8099836111068726, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 3991 }, { "completion_length": 133.125, "epoch": 2.1359015516318887, "grad_norm": 1.3630555868148804, "kl": 0.17953824996948242, "learning_rate": 1.1608010024026634e-06, "loss": 0.0072, "reward": 2.440718650817871, "reward_std": 0.5492069125175476, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.40946877002716064, "step": 3992 }, { "completion_length": 123.6875, "epoch": 2.1364365971107544, "grad_norm": 3.1832759380340576, "kl": 0.20673391222953796, "learning_rate": 1.1594869398917855e-06, "loss": 0.0083, "reward": 2.1256561279296875, "reward_std": 0.9462953209877014, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42253124713897705, "step": 3993 }, { "completion_length": 116.9375, "epoch": 2.13697164258962, "grad_norm": 0.9618066549301147, "kl": 0.18720224499702454, "learning_rate": 1.1581733969894582e-06, "loss": 0.0075, "reward": 1.5567188262939453, "reward_std": 0.42713749408721924, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49421876668930054, "step": 3994 }, { "completion_length": 146.6875, "epoch": 2.137506688068486, "grad_norm": 0.9673972129821777, "kl": 0.14706459641456604, "learning_rate": 1.1568603742048354e-06, "loss": 0.0059, "reward": 2.0215625762939453, "reward_std": 0.8534022569656372, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41218748688697815, "step": 3995 }, { "completion_length": 127.5, "epoch": 2.1380417335473516, "grad_norm": 1.9351623058319092, "kl": 0.29802966117858887, "learning_rate": 1.1555478720468697e-06, "loss": 0.0119, "reward": 1.9993125200271606, "reward_std": 0.6092594861984253, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48368749022483826, "step": 3996 }, { "completion_length": 139.875, "epoch": 2.1385767790262173, "grad_norm": 1.0787469148635864, "kl": 0.1691788136959076, "learning_rate": 1.1542358910243107e-06, "loss": 0.0068, "reward": 1.871343731880188, "reward_std": 0.7757211923599243, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.449468731880188, "step": 3997 }, { "completion_length": 134.4375, "epoch": 2.139111824505083, "grad_norm": 102.9714126586914, "kl": 0.6025916934013367, "learning_rate": 1.1529244316457097e-06, "loss": 0.0241, "reward": 1.7805312871932983, "reward_std": 0.7277839183807373, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46803125739097595, "step": 3998 }, { "completion_length": 115.6875, "epoch": 2.139646869983949, "grad_norm": 7.340562343597412, "kl": 0.2673647999763489, "learning_rate": 1.1516134944194128e-06, "loss": 0.0107, "reward": 2.1190624237060547, "reward_std": 0.8181071281433105, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47843751311302185, "step": 3999 }, { "completion_length": 129.5625, "epoch": 2.140181915462814, "grad_norm": 0.7391693592071533, "kl": 0.18841159343719482, "learning_rate": 1.1503030798535628e-06, "loss": 0.0075, "reward": 1.9000625610351562, "reward_std": 0.5141209959983826, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4938125014305115, "step": 4000 }, { "completion_length": 111.0625, "epoch": 2.14071696094168, "grad_norm": 1.4706133604049683, "kl": 0.3111632466316223, "learning_rate": 1.1489931884561048e-06, "loss": 0.0124, "reward": 2.124875068664551, "reward_std": 0.7166091799736023, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 4001 }, { "completion_length": 126.59375, "epoch": 2.1412520064205456, "grad_norm": 1.0187596082687378, "kl": 0.1989298313856125, "learning_rate": 1.147683820734774e-06, "loss": 0.008, "reward": 2.0859999656677246, "reward_std": 0.7015864849090576, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4766250252723694, "step": 4002 }, { "completion_length": 126.5, "epoch": 2.1417870518994113, "grad_norm": 0.4263535439968109, "kl": 0.14347977936267853, "learning_rate": 1.146374977197108e-06, "loss": 0.0057, "reward": 2.1015625, "reward_std": 0.6187013387680054, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4921875, "step": 4003 }, { "completion_length": 133.90625, "epoch": 2.142322097378277, "grad_norm": 1.2822405099868774, "kl": 0.20728278160095215, "learning_rate": 1.1450666583504417e-06, "loss": 0.0083, "reward": 2.8286561965942383, "reward_std": 1.115075945854187, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45365625619888306, "step": 4004 }, { "completion_length": 124.875, "epoch": 2.142857142857143, "grad_norm": 1.2507998943328857, "kl": 0.242354154586792, "learning_rate": 1.143758864701901e-06, "loss": 0.0097, "reward": 1.5102187395095825, "reward_std": 0.6772959232330322, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4320937395095825, "step": 4005 }, { "completion_length": 145.8125, "epoch": 2.1433921883360085, "grad_norm": 0.8726209998130798, "kl": 0.17741155624389648, "learning_rate": 1.1424515967584143e-06, "loss": 0.0071, "reward": 1.1909375190734863, "reward_std": 0.35592561960220337, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40968748927116394, "step": 4006 }, { "completion_length": 106.34375, "epoch": 2.1439272338148743, "grad_norm": 0.718591034412384, "kl": 0.18220636248588562, "learning_rate": 1.1411448550267029e-06, "loss": 0.0073, "reward": 2.7577500343322754, "reward_std": 0.3447922468185425, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.476500004529953, "step": 4007 }, { "completion_length": 120.28125, "epoch": 2.14446227929374, "grad_norm": 1.4809948205947876, "kl": 0.23254287242889404, "learning_rate": 1.1398386400132839e-06, "loss": 0.0093, "reward": 2.3041250705718994, "reward_std": 0.5773229002952576, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49162501096725464, "step": 4008 }, { "completion_length": 142.9375, "epoch": 2.1449973247726057, "grad_norm": 0.5308072566986084, "kl": 0.17143034934997559, "learning_rate": 1.1385329522244731e-06, "loss": 0.0069, "reward": 1.8379062414169312, "reward_std": 0.6737940907478333, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44728124141693115, "step": 4009 }, { "completion_length": 132.0, "epoch": 2.1455323702514715, "grad_norm": 0.7037320733070374, "kl": 0.22491353750228882, "learning_rate": 1.1372277921663796e-06, "loss": 0.009, "reward": 2.005906105041504, "reward_std": 0.7191998362541199, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47465625405311584, "step": 4010 }, { "completion_length": 151.6875, "epoch": 2.146067415730337, "grad_norm": 2.361217975616455, "kl": 0.16119693219661713, "learning_rate": 1.1359231603449077e-06, "loss": 0.0064, "reward": 1.550624966621399, "reward_std": 0.9538334608078003, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4256249964237213, "step": 4011 }, { "completion_length": 126.40625, "epoch": 2.146602461209203, "grad_norm": 2.950083017349243, "kl": 0.4098181426525116, "learning_rate": 1.1346190572657575e-06, "loss": 0.0164, "reward": 2.1175625324249268, "reward_std": 0.9355717897415161, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.445687472820282, "step": 4012 }, { "completion_length": 153.59375, "epoch": 2.1471375066880687, "grad_norm": 2.7591989040374756, "kl": 0.2172633409500122, "learning_rate": 1.133315483434426e-06, "loss": 0.0087, "reward": 1.7186250686645508, "reward_std": 0.7407819032669067, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.437375009059906, "step": 4013 }, { "completion_length": 137.375, "epoch": 2.1476725521669344, "grad_norm": 1.3632487058639526, "kl": 0.24745629727840424, "learning_rate": 1.1320124393562024e-06, "loss": 0.0099, "reward": 2.234375, "reward_std": 0.7345927953720093, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 4014 }, { "completion_length": 117.5625, "epoch": 2.1482075976457997, "grad_norm": 0.5272190570831299, "kl": 0.22419513761997223, "learning_rate": 1.1307099255361703e-06, "loss": 0.009, "reward": 2.8540937900543213, "reward_std": 0.7833365797996521, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4790937304496765, "step": 4015 }, { "completion_length": 131.15625, "epoch": 2.1487426431246655, "grad_norm": 1.119584560394287, "kl": 0.23059329390525818, "learning_rate": 1.1294079424792112e-06, "loss": 0.0092, "reward": 2.023218870162964, "reward_std": 0.7336331605911255, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4763437509536743, "step": 4016 }, { "completion_length": 121.0625, "epoch": 2.149277688603531, "grad_norm": 0.6822390556335449, "kl": 0.2091403305530548, "learning_rate": 1.128106490689998e-06, "loss": 0.0084, "reward": 2.046875, "reward_std": 0.5702300071716309, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 4017 }, { "completion_length": 123.21875, "epoch": 2.149812734082397, "grad_norm": 1.1548740863800049, "kl": 0.21964508295059204, "learning_rate": 1.126805570672997e-06, "loss": 0.0088, "reward": 2.256624937057495, "reward_std": 0.986464262008667, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4753749966621399, "step": 4018 }, { "completion_length": 121.90625, "epoch": 2.1503477795612627, "grad_norm": 38.67277908325195, "kl": 4.8127946853637695, "learning_rate": 1.1255051829324715e-06, "loss": 0.1925, "reward": 2.013906240463257, "reward_std": 0.8388653993606567, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48265624046325684, "step": 4019 }, { "completion_length": 120.125, "epoch": 2.1508828250401284, "grad_norm": 3.386669158935547, "kl": 0.376693457365036, "learning_rate": 1.1242053279724763e-06, "loss": 0.0151, "reward": 2.585812568664551, "reward_std": 0.6736419200897217, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 4020 }, { "completion_length": 136.15625, "epoch": 2.151417870518994, "grad_norm": 1.4442901611328125, "kl": 0.2026159167289734, "learning_rate": 1.1229060062968597e-06, "loss": 0.0081, "reward": 2.029750108718872, "reward_std": 0.9410998821258545, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4516249895095825, "step": 4021 }, { "completion_length": 122.5625, "epoch": 2.15195291599786, "grad_norm": 2.434479236602783, "kl": 0.21997758746147156, "learning_rate": 1.1216072184092627e-06, "loss": 0.0088, "reward": 2.050187587738037, "reward_std": 0.9717894792556763, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45643749833106995, "step": 4022 }, { "completion_length": 140.46875, "epoch": 2.1524879614767256, "grad_norm": 0.7811168432235718, "kl": 0.181159108877182, "learning_rate": 1.1203089648131226e-06, "loss": 0.0072, "reward": 1.6014375686645508, "reward_std": 0.5354476571083069, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 4023 }, { "completion_length": 106.78125, "epoch": 2.1530230069555913, "grad_norm": 2.5670952796936035, "kl": 0.3986191153526306, "learning_rate": 1.1190112460116661e-06, "loss": 0.0159, "reward": 2.349375009536743, "reward_std": 0.842013955116272, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49000000953674316, "step": 4024 }, { "completion_length": 113.90625, "epoch": 2.153558052434457, "grad_norm": 0.6755005717277527, "kl": 0.23073384165763855, "learning_rate": 1.1177140625079132e-06, "loss": 0.0092, "reward": 2.0280938148498535, "reward_std": 0.6017480492591858, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48121872544288635, "step": 4025 }, { "completion_length": 135.625, "epoch": 2.154093097913323, "grad_norm": 6.160092830657959, "kl": 0.30450475215911865, "learning_rate": 1.1164174148046788e-06, "loss": 0.0122, "reward": 2.48856258392334, "reward_std": 0.783912181854248, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4885624945163727, "step": 4026 }, { "completion_length": 139.4375, "epoch": 2.1546281433921886, "grad_norm": 4.247799873352051, "kl": 0.1694808155298233, "learning_rate": 1.1151213034045675e-06, "loss": 0.0068, "reward": 1.4842500686645508, "reward_std": 0.6561777591705322, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.453000009059906, "step": 4027 }, { "completion_length": 127.625, "epoch": 2.155163188871054, "grad_norm": 1.5364606380462646, "kl": 0.17912901937961578, "learning_rate": 1.1138257288099757e-06, "loss": 0.0072, "reward": 2.0802500247955322, "reward_std": 0.7153259515762329, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45524999499320984, "step": 4028 }, { "completion_length": 128.71875, "epoch": 2.1556982343499196, "grad_norm": 0.42354705929756165, "kl": 0.21120525896549225, "learning_rate": 1.1125306915230962e-06, "loss": 0.0084, "reward": 2.015625, "reward_std": 0.49645745754241943, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 4029 }, { "completion_length": 149.75, "epoch": 2.1562332798287853, "grad_norm": 2.14841890335083, "kl": 0.2099878340959549, "learning_rate": 1.1112361920459065e-06, "loss": 0.0084, "reward": 1.9541250467300415, "reward_std": 0.7975926399230957, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.42287498712539673, "step": 4030 }, { "completion_length": 116.53125, "epoch": 2.156768325307651, "grad_norm": 1.5240099430084229, "kl": 0.23291251063346863, "learning_rate": 1.1099422308801816e-06, "loss": 0.0093, "reward": 2.4017813205718994, "reward_std": 0.27780458331108093, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49553126096725464, "step": 4031 }, { "completion_length": 102.3125, "epoch": 2.157303370786517, "grad_norm": 69266.4765625, "kl": 164.96798706054688, "learning_rate": 1.1086488085274854e-06, "loss": 6.5987, "reward": 2.644406318664551, "reward_std": 0.6329784393310547, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 4032 }, { "completion_length": 132.84375, "epoch": 2.1578384162653825, "grad_norm": 0.6940500140190125, "kl": 0.16329628229141235, "learning_rate": 1.1073559254891722e-06, "loss": 0.0065, "reward": 2.14453125, "reward_std": 1.0896992683410645, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45703125, "step": 4033 }, { "completion_length": 140.40625, "epoch": 2.1583734617442483, "grad_norm": 2.9000370502471924, "kl": 0.2072046995162964, "learning_rate": 1.1060635822663894e-06, "loss": 0.0083, "reward": 1.7775312662124634, "reward_std": 0.4085090458393097, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.449406236410141, "step": 4034 }, { "completion_length": 122.34375, "epoch": 2.158908507223114, "grad_norm": 3.3791637420654297, "kl": 0.21670877933502197, "learning_rate": 1.1047717793600743e-06, "loss": 0.0087, "reward": 2.4716873168945312, "reward_std": 0.8503068685531616, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4560624957084656, "step": 4035 }, { "completion_length": 139.15625, "epoch": 2.1594435527019797, "grad_norm": 0.7526344656944275, "kl": 0.1498657912015915, "learning_rate": 1.1034805172709523e-06, "loss": 0.006, "reward": 2.140625, "reward_std": 0.9269080758094788, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46875, "step": 4036 }, { "completion_length": 127.375, "epoch": 2.1599785981808455, "grad_norm": 0.7774639129638672, "kl": 0.22013844549655914, "learning_rate": 1.1021897964995437e-06, "loss": 0.0088, "reward": 2.6119375228881836, "reward_std": 0.7647464275360107, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4244374930858612, "step": 4037 }, { "completion_length": 124.5625, "epoch": 2.160513643659711, "grad_norm": 794.7966918945312, "kl": 4.325260639190674, "learning_rate": 1.100899617546156e-06, "loss": 0.173, "reward": 2.4606873989105225, "reward_std": 0.9253076314926147, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.476312518119812, "step": 4038 }, { "completion_length": 123.9375, "epoch": 2.161048689138577, "grad_norm": 1.8819770812988281, "kl": 0.20747987926006317, "learning_rate": 1.0996099809108868e-06, "loss": 0.0083, "reward": 1.8235937356948853, "reward_std": 0.9186736345291138, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46421873569488525, "step": 4039 }, { "completion_length": 108.3125, "epoch": 2.1615837346174427, "grad_norm": 1.6027780771255493, "kl": 0.17962610721588135, "learning_rate": 1.098320887093623e-06, "loss": 0.0072, "reward": 2.2490625381469727, "reward_std": 1.1356518268585205, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48343750834465027, "step": 4040 }, { "completion_length": 123.09375, "epoch": 2.162118780096308, "grad_norm": 1.2917033433914185, "kl": 0.27236688137054443, "learning_rate": 1.0970323365940443e-06, "loss": 0.0109, "reward": 2.7109375, "reward_std": 0.5460367202758789, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4921875, "step": 4041 }, { "completion_length": 127.40625, "epoch": 2.1626538255751737, "grad_norm": 1.2275055646896362, "kl": 0.22644849121570587, "learning_rate": 1.0957443299116164e-06, "loss": 0.0091, "reward": 1.3416249752044678, "reward_std": 0.41715216636657715, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48225000500679016, "step": 4042 }, { "completion_length": 115.34375, "epoch": 2.1631888710540395, "grad_norm": 1.3527402877807617, "kl": 0.2539542317390442, "learning_rate": 1.0944568675455947e-06, "loss": 0.0102, "reward": 1.826968789100647, "reward_std": 0.712425947189331, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4519687294960022, "step": 4043 }, { "completion_length": 121.8125, "epoch": 2.163723916532905, "grad_norm": 0.6805012226104736, "kl": 0.21586857736110687, "learning_rate": 1.0931699499950267e-06, "loss": 0.0086, "reward": 2.4541563987731934, "reward_std": 0.8704938292503357, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4697812497615814, "step": 4044 }, { "completion_length": 150.0, "epoch": 2.164258962011771, "grad_norm": 1.7325432300567627, "kl": 0.15381412208080292, "learning_rate": 1.0918835777587428e-06, "loss": 0.0062, "reward": 1.2161874771118164, "reward_std": 0.7088398933410645, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4193124771118164, "step": 4045 }, { "completion_length": 130.46875, "epoch": 2.1647940074906367, "grad_norm": 2.3961281776428223, "kl": 0.3330814838409424, "learning_rate": 1.0905977513353672e-06, "loss": 0.0133, "reward": 2.2950310707092285, "reward_std": 0.9060744047164917, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48253124952316284, "step": 4046 }, { "completion_length": 125.3125, "epoch": 2.1653290529695024, "grad_norm": 0.9015173316001892, "kl": 0.24814686179161072, "learning_rate": 1.089312471223313e-06, "loss": 0.0099, "reward": 1.9824999570846558, "reward_std": 0.7142833471298218, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45125001668930054, "step": 4047 }, { "completion_length": 110.875, "epoch": 2.165864098448368, "grad_norm": 1.113820195198059, "kl": 0.23356588184833527, "learning_rate": 1.0880277379207758e-06, "loss": 0.0093, "reward": 2.364093780517578, "reward_std": 0.31289762258529663, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47346875071525574, "step": 4048 }, { "completion_length": 121.0, "epoch": 2.166399143927234, "grad_norm": 1.6620066165924072, "kl": 0.2663443088531494, "learning_rate": 1.086743551925745e-06, "loss": 0.0107, "reward": 1.7143125534057617, "reward_std": 0.6802492141723633, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47993749380111694, "step": 4049 }, { "completion_length": 110.75, "epoch": 2.1669341894060996, "grad_norm": 1.2802973985671997, "kl": 0.28430652618408203, "learning_rate": 1.0854599137359954e-06, "loss": 0.0114, "reward": 2.083343744277954, "reward_std": 0.6402485370635986, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4739687442779541, "step": 4050 }, { "completion_length": 144.25, "epoch": 2.1674692348849653, "grad_norm": 1.171201229095459, "kl": 0.15284182131290436, "learning_rate": 1.0841768238490883e-06, "loss": 0.0061, "reward": 1.4755938053131104, "reward_std": 0.7494052648544312, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4130937457084656, "step": 4051 }, { "completion_length": 142.59375, "epoch": 2.168004280363831, "grad_norm": 1.783808946609497, "kl": 0.17390486598014832, "learning_rate": 1.0828942827623765e-06, "loss": 0.007, "reward": 1.558687448501587, "reward_std": 0.8136403560638428, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4649375081062317, "step": 4052 }, { "completion_length": 129.6875, "epoch": 2.168539325842697, "grad_norm": 0.5860246419906616, "kl": 0.15405413508415222, "learning_rate": 1.0816122909729957e-06, "loss": 0.0062, "reward": 2.239281177520752, "reward_std": 0.6819233894348145, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47365623712539673, "step": 4053 }, { "completion_length": 129.78125, "epoch": 2.169074371321562, "grad_norm": 0.9707515239715576, "kl": 0.2181837558746338, "learning_rate": 1.0803308489778713e-06, "loss": 0.0087, "reward": 2.120187520980835, "reward_std": 0.8906615972518921, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4639374613761902, "step": 4054 }, { "completion_length": 116.4375, "epoch": 2.169609416800428, "grad_norm": 31.12590980529785, "kl": 0.9223551750183105, "learning_rate": 1.0790499572737134e-06, "loss": 0.0369, "reward": 3.0078125, "reward_std": 0.49834999442100525, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4921875, "step": 4055 }, { "completion_length": 111.1875, "epoch": 2.1701444622792936, "grad_norm": 1.2626398801803589, "kl": 0.19299165904521942, "learning_rate": 1.077769616357022e-06, "loss": 0.0077, "reward": 2.7256875038146973, "reward_std": 1.2008914947509766, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46006250381469727, "step": 4056 }, { "completion_length": 133.46875, "epoch": 2.1706795077581593, "grad_norm": 0.8284915089607239, "kl": 0.15800830721855164, "learning_rate": 1.076489826724081e-06, "loss": 0.0063, "reward": 2.1015625, "reward_std": 0.7390690445899963, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 4057 }, { "completion_length": 131.125, "epoch": 2.171214553237025, "grad_norm": 1.5287553071975708, "kl": 0.18435870110988617, "learning_rate": 1.0752105888709605e-06, "loss": 0.0074, "reward": 2.4760937690734863, "reward_std": 0.7606183290481567, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46046873927116394, "step": 4058 }, { "completion_length": 129.5625, "epoch": 2.171749598715891, "grad_norm": 0.8866892457008362, "kl": 0.1653069257736206, "learning_rate": 1.0739319032935192e-06, "loss": 0.0066, "reward": 2.76743745803833, "reward_std": 1.0909039974212646, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48618751764297485, "step": 4059 }, { "completion_length": 136.90625, "epoch": 2.1722846441947565, "grad_norm": 1.0522665977478027, "kl": 0.17757420241832733, "learning_rate": 1.0726537704873994e-06, "loss": 0.0071, "reward": 2.066281318664551, "reward_std": 1.0465466976165771, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 4060 }, { "completion_length": 137.75, "epoch": 2.1728196896736223, "grad_norm": 1.0669904947280884, "kl": 0.19761180877685547, "learning_rate": 1.0713761909480288e-06, "loss": 0.0079, "reward": 2.2187187671661377, "reward_std": 1.1196904182434082, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4530937671661377, "step": 4061 }, { "completion_length": 128.6875, "epoch": 2.173354735152488, "grad_norm": 1.6297637224197388, "kl": 0.29356303811073303, "learning_rate": 1.0700991651706242e-06, "loss": 0.0117, "reward": 1.746500015258789, "reward_std": 0.424041748046875, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44962501525878906, "step": 4062 }, { "completion_length": 135.9375, "epoch": 2.1738897806313537, "grad_norm": 1.0522165298461914, "kl": 0.17371973395347595, "learning_rate": 1.0688226936501832e-06, "loss": 0.0069, "reward": 1.7721562385559082, "reward_std": 0.47550010681152344, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4440312683582306, "step": 4063 }, { "completion_length": 122.5, "epoch": 2.1744248261102195, "grad_norm": 0.6482115387916565, "kl": 0.14514651894569397, "learning_rate": 1.0675467768814907e-06, "loss": 0.0058, "reward": 2.7111563682556152, "reward_std": 0.9454267621040344, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4767812490463257, "step": 4064 }, { "completion_length": 142.5, "epoch": 2.174959871589085, "grad_norm": 0.977232813835144, "kl": 0.18102961778640747, "learning_rate": 1.0662714153591179e-06, "loss": 0.0072, "reward": 1.572812557220459, "reward_std": 0.6406163573265076, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4321874976158142, "step": 4065 }, { "completion_length": 126.78125, "epoch": 2.175494917067951, "grad_norm": 1.7577561140060425, "kl": 0.2815268933773041, "learning_rate": 1.0649966095774184e-06, "loss": 0.0113, "reward": 2.137406349182129, "reward_std": 0.737950325012207, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43428122997283936, "step": 4066 }, { "completion_length": 134.96875, "epoch": 2.1760299625468167, "grad_norm": 2.485454797744751, "kl": 0.3553552031517029, "learning_rate": 1.0637223600305314e-06, "loss": 0.0142, "reward": 1.541968822479248, "reward_std": 0.6801400184631348, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43259376287460327, "step": 4067 }, { "completion_length": 146.875, "epoch": 2.176565008025682, "grad_norm": 2.276304006576538, "kl": 0.2324807196855545, "learning_rate": 1.0624486672123796e-06, "loss": 0.0093, "reward": 1.6223125457763672, "reward_std": 0.6478222608566284, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4191874861717224, "step": 4068 }, { "completion_length": 127.375, "epoch": 2.1771000535045477, "grad_norm": 0.855736494064331, "kl": 0.16338005661964417, "learning_rate": 1.0611755316166728e-06, "loss": 0.0065, "reward": 2.5777812004089355, "reward_std": 1.1065036058425903, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4840312600135803, "step": 4069 }, { "completion_length": 129.5625, "epoch": 2.1776350989834135, "grad_norm": 1.0395740270614624, "kl": 0.18245771527290344, "learning_rate": 1.0599029537369013e-06, "loss": 0.0073, "reward": 2.451218605041504, "reward_std": 0.9501359462738037, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48246875405311584, "step": 4070 }, { "completion_length": 136.75, "epoch": 2.178170144462279, "grad_norm": 1.622559666633606, "kl": 0.18159043788909912, "learning_rate": 1.0586309340663401e-06, "loss": 0.0073, "reward": 2.2679061889648438, "reward_std": 1.1243795156478882, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4397812485694885, "step": 4071 }, { "completion_length": 123.96875, "epoch": 2.178705189941145, "grad_norm": 0.9754989743232727, "kl": 0.30611294507980347, "learning_rate": 1.0573594730980511e-06, "loss": 0.0122, "reward": 1.9131873846054077, "reward_std": 0.612377941608429, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46006250381469727, "step": 4072 }, { "completion_length": 148.46875, "epoch": 2.1792402354200107, "grad_norm": 1.358919382095337, "kl": 0.2630089819431305, "learning_rate": 1.056088571324873e-06, "loss": 0.0105, "reward": 1.6365000009536743, "reward_std": 0.9097342491149902, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4177500009536743, "step": 4073 }, { "completion_length": 151.0, "epoch": 2.1797752808988764, "grad_norm": 1.4797812700271606, "kl": 0.20727255940437317, "learning_rate": 1.054818229239434e-06, "loss": 0.0083, "reward": 1.666968822479248, "reward_std": 0.5417487621307373, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4482187330722809, "step": 4074 }, { "completion_length": 110.125, "epoch": 2.180310326377742, "grad_norm": 1.3445302248001099, "kl": 0.3365313410758972, "learning_rate": 1.0535484473341448e-06, "loss": 0.0135, "reward": 1.8517186641693115, "reward_std": 0.7175238132476807, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4923437535762787, "step": 4075 }, { "completion_length": 127.75, "epoch": 2.180845371856608, "grad_norm": 0.9873226881027222, "kl": 0.17097297310829163, "learning_rate": 1.0522792261011932e-06, "loss": 0.0068, "reward": 1.9484062194824219, "reward_std": 0.8312851190567017, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.40153124928474426, "step": 4076 }, { "completion_length": 137.1875, "epoch": 2.1813804173354736, "grad_norm": 0.9982686638832092, "kl": 0.2867920398712158, "learning_rate": 1.0510105660325567e-06, "loss": 0.0115, "reward": 1.9830000400543213, "reward_std": 1.283815622329712, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4517500102519989, "step": 4077 }, { "completion_length": 128.71875, "epoch": 2.1819154628143393, "grad_norm": 0.928501307964325, "kl": 0.17348447442054749, "learning_rate": 1.0497424676199911e-06, "loss": 0.0069, "reward": 2.4252185821533203, "reward_std": 1.1441739797592163, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44084376096725464, "step": 4078 }, { "completion_length": 119.65625, "epoch": 2.182450508293205, "grad_norm": 1.7267346382141113, "kl": 0.15321850776672363, "learning_rate": 1.048474931355035e-06, "loss": 0.0061, "reward": 1.9550312757492065, "reward_std": 0.9231882095336914, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47065624594688416, "step": 4079 }, { "completion_length": 146.4375, "epoch": 2.182985553772071, "grad_norm": 0.8509852290153503, "kl": 0.17122524976730347, "learning_rate": 1.0472079577290113e-06, "loss": 0.0068, "reward": 1.6550312042236328, "reward_std": 0.6851941347122192, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4519062638282776, "step": 4080 }, { "completion_length": 148.65625, "epoch": 2.1835205992509366, "grad_norm": 0.9293726086616516, "kl": 0.21466520428657532, "learning_rate": 1.0459415472330226e-06, "loss": 0.0086, "reward": 1.5740000009536743, "reward_std": 1.1843682527542114, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4021250009536743, "step": 4081 }, { "completion_length": 129.125, "epoch": 2.184055644729802, "grad_norm": 1.1271145343780518, "kl": 0.2489163875579834, "learning_rate": 1.0446757003579537e-06, "loss": 0.01, "reward": 1.625906229019165, "reward_std": 0.8365093469619751, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46965622901916504, "step": 4082 }, { "completion_length": 143.9375, "epoch": 2.1845906902086676, "grad_norm": 0.8427342772483826, "kl": 0.2039877474308014, "learning_rate": 1.0434104175944702e-06, "loss": 0.0082, "reward": 2.1190311908721924, "reward_std": 1.0051724910736084, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44715625047683716, "step": 4083 }, { "completion_length": 111.0, "epoch": 2.1851257356875333, "grad_norm": 1.0451356172561646, "kl": 0.2995593547821045, "learning_rate": 1.0421456994330217e-06, "loss": 0.012, "reward": 2.40625, "reward_std": 0.6075299978256226, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 4084 }, { "completion_length": 111.09375, "epoch": 2.185660781166399, "grad_norm": 1.9730948209762573, "kl": 0.2779870629310608, "learning_rate": 1.0408815463638367e-06, "loss": 0.0111, "reward": 2.296875, "reward_std": 0.8889582753181458, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 4085 }, { "completion_length": 132.28125, "epoch": 2.186195826645265, "grad_norm": 0.7731685638427734, "kl": 0.23003482818603516, "learning_rate": 1.0396179588769238e-06, "loss": 0.0092, "reward": 1.4647812843322754, "reward_std": 0.5962507724761963, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4491562247276306, "step": 4086 }, { "completion_length": 117.84375, "epoch": 2.1867308721241305, "grad_norm": 1.0449138879776, "kl": 0.24365171790122986, "learning_rate": 1.0383549374620755e-06, "loss": 0.0097, "reward": 2.22446870803833, "reward_std": 0.8026995062828064, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47446876764297485, "step": 4087 }, { "completion_length": 141.34375, "epoch": 2.1872659176029963, "grad_norm": 0.6424282789230347, "kl": 0.155878946185112, "learning_rate": 1.0370924826088628e-06, "loss": 0.0062, "reward": 1.3902812004089355, "reward_std": 0.852204442024231, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.39028123021125793, "step": 4088 }, { "completion_length": 143.625, "epoch": 2.187800963081862, "grad_norm": 22.777299880981445, "kl": 1.2613993883132935, "learning_rate": 1.0358305948066355e-06, "loss": 0.0505, "reward": 1.5767812728881836, "reward_std": 0.9235238432884216, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4361562430858612, "step": 4089 }, { "completion_length": 137.6875, "epoch": 2.1883360085607277, "grad_norm": 4217270528.0, "kl": 181333984.0, "learning_rate": 1.0345692745445293e-06, "loss": 7253359.5, "reward": 2.1033124923706055, "reward_std": 0.697624921798706, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43143749237060547, "step": 4090 }, { "completion_length": 123.875, "epoch": 2.1888710540395935, "grad_norm": 0.9507137537002563, "kl": 0.17333021759986877, "learning_rate": 1.0333085223114516e-06, "loss": 0.0069, "reward": 2.2576560974121094, "reward_std": 0.7144203782081604, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4764062464237213, "step": 4091 }, { "completion_length": 152.125, "epoch": 2.189406099518459, "grad_norm": 1.3103007078170776, "kl": 0.2737300395965576, "learning_rate": 1.0320483385960976e-06, "loss": 0.0109, "reward": 1.3716249465942383, "reward_std": 0.7634322643280029, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43412500619888306, "step": 4092 }, { "completion_length": 140.71875, "epoch": 2.189941144997325, "grad_norm": 1.6630234718322754, "kl": 0.19977641105651855, "learning_rate": 1.0307887238869369e-06, "loss": 0.008, "reward": 1.3865312337875366, "reward_std": 0.9080235958099365, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3709062337875366, "step": 4093 }, { "completion_length": 120.8125, "epoch": 2.1904761904761907, "grad_norm": 0.5857306122779846, "kl": 0.1696917712688446, "learning_rate": 1.02952967867222e-06, "loss": 0.0068, "reward": 2.65625, "reward_std": 0.8245437145233154, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 4094 }, { "completion_length": 128.03125, "epoch": 2.191011235955056, "grad_norm": 2.6501777172088623, "kl": 0.24588577449321747, "learning_rate": 1.0282712034399786e-06, "loss": 0.0098, "reward": 1.519937515258789, "reward_std": 0.7648061513900757, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45743751525878906, "step": 4095 }, { "completion_length": 130.34375, "epoch": 2.1915462814339217, "grad_norm": 1.2854111194610596, "kl": 0.20914630591869354, "learning_rate": 1.027013298678021e-06, "loss": 0.0084, "reward": 2.293375015258789, "reward_std": 1.1662015914916992, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.44962501525878906, "step": 4096 }, { "completion_length": 138.5, "epoch": 2.1920813269127875, "grad_norm": 1.0689140558242798, "kl": 0.19638529419898987, "learning_rate": 1.0257559648739342e-06, "loss": 0.0079, "reward": 1.56431245803833, "reward_std": 0.8733956813812256, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43931251764297485, "step": 4097 }, { "completion_length": 117.34375, "epoch": 2.192616372391653, "grad_norm": 0.7791805863380432, "kl": 0.1940675675868988, "learning_rate": 1.0244992025150866e-06, "loss": 0.0078, "reward": 2.234375, "reward_std": 0.7978519201278687, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 4098 }, { "completion_length": 129.25, "epoch": 2.193151417870519, "grad_norm": 1.8982363939285278, "kl": 0.21190069615840912, "learning_rate": 1.0232430120886227e-06, "loss": 0.0085, "reward": 2.451812505722046, "reward_std": 0.8508399724960327, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4674375057220459, "step": 4099 }, { "completion_length": 121.0625, "epoch": 2.1936864633493847, "grad_norm": 0.9871771931648254, "kl": 0.20588946342468262, "learning_rate": 1.0219873940814656e-06, "loss": 0.0082, "reward": 2.020750045776367, "reward_std": 0.6364282965660095, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4738749861717224, "step": 4100 }, { "completion_length": 150.875, "epoch": 2.1942215088282504, "grad_norm": 1.1680773496627808, "kl": 0.15326325595378876, "learning_rate": 1.0207323489803164e-06, "loss": 0.0061, "reward": 1.4952499866485596, "reward_std": 0.5642858743667603, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.43274998664855957, "step": 4101 }, { "completion_length": 149.1875, "epoch": 2.194756554307116, "grad_norm": 0.8193992376327515, "kl": 0.14410927891731262, "learning_rate": 1.0194778772716562e-06, "loss": 0.0058, "reward": 1.9277499914169312, "reward_std": 0.7364753484725952, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45899999141693115, "step": 4102 }, { "completion_length": 130.09375, "epoch": 2.195291599785982, "grad_norm": 1.9816248416900635, "kl": 0.47278594970703125, "learning_rate": 1.0182239794417418e-06, "loss": 0.0189, "reward": 1.6880624294281006, "reward_std": 0.7296777963638306, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45368748903274536, "step": 4103 }, { "completion_length": 138.1875, "epoch": 2.1958266452648476, "grad_norm": 1.0348401069641113, "kl": 0.2322755753993988, "learning_rate": 1.0169706559766063e-06, "loss": 0.0093, "reward": 2.1883437633514404, "reward_std": 1.1077746152877808, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46959376335144043, "step": 4104 }, { "completion_length": 105.5, "epoch": 2.1963616907437133, "grad_norm": 1.6679153442382812, "kl": 0.23653903603553772, "learning_rate": 1.0157179073620644e-06, "loss": 0.0095, "reward": 2.484375, "reward_std": 0.4960165321826935, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 4105 }, { "completion_length": 110.25, "epoch": 2.196896736222579, "grad_norm": 0.3743348717689514, "kl": 0.12745428085327148, "learning_rate": 1.0144657340837042e-06, "loss": 0.0051, "reward": 2.8984375, "reward_std": 0.4918332099914551, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4921875, "step": 4106 }, { "completion_length": 119.28125, "epoch": 2.197431781701445, "grad_norm": 1.2653883695602417, "kl": 0.22961556911468506, "learning_rate": 1.0132141366268913e-06, "loss": 0.0092, "reward": 2.428500175476074, "reward_std": 0.9223881363868713, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4753749966621399, "step": 4107 }, { "completion_length": 127.71875, "epoch": 2.19796682718031, "grad_norm": 2.635939836502075, "kl": 0.17910756170749664, "learning_rate": 1.0119631154767706e-06, "loss": 0.0072, "reward": 2.3824374675750732, "reward_std": 0.825882077217102, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47618749737739563, "step": 4108 }, { "completion_length": 131.28125, "epoch": 2.198501872659176, "grad_norm": 1.3848568201065063, "kl": 0.3257240653038025, "learning_rate": 1.0107126711182613e-06, "loss": 0.013, "reward": 1.4420937299728394, "reward_std": 0.7232113480567932, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44209375977516174, "step": 4109 }, { "completion_length": 139.90625, "epoch": 2.1990369181380416, "grad_norm": 2.8332388401031494, "kl": 0.21507489681243896, "learning_rate": 1.009462804036059e-06, "loss": 0.0086, "reward": 1.625, "reward_std": 0.855508029460907, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.453125, "step": 4110 }, { "completion_length": 138.5625, "epoch": 2.1995719636169073, "grad_norm": 2.3034844398498535, "kl": 0.3537929654121399, "learning_rate": 1.0082135147146358e-06, "loss": 0.0142, "reward": 2.031937599182129, "reward_std": 1.1815159320831299, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42256250977516174, "step": 4111 }, { "completion_length": 141.90625, "epoch": 2.200107009095773, "grad_norm": 1.4035016298294067, "kl": 0.16170436143875122, "learning_rate": 1.0069648036382414e-06, "loss": 0.0065, "reward": 1.9739999771118164, "reward_std": 0.6642743349075317, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4739999771118164, "step": 4112 }, { "completion_length": 113.71875, "epoch": 2.200642054574639, "grad_norm": 1.1510673761367798, "kl": 0.34906283020973206, "learning_rate": 1.0057166712908997e-06, "loss": 0.014, "reward": 2.6793124675750732, "reward_std": 0.5332452058792114, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44493749737739563, "step": 4113 }, { "completion_length": 113.25, "epoch": 2.2011771000535045, "grad_norm": 1.4805711507797241, "kl": 0.19629055261611938, "learning_rate": 1.0044691181564098e-06, "loss": 0.0079, "reward": 2.59765625, "reward_std": 0.7321277260780334, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 4114 }, { "completion_length": 131.0, "epoch": 2.2017121455323703, "grad_norm": 0.9153544902801514, "kl": 0.15456074476242065, "learning_rate": 1.0032221447183496e-06, "loss": 0.0062, "reward": 2.049093723297119, "reward_std": 0.666454553604126, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4553437829017639, "step": 4115 }, { "completion_length": 166.875, "epoch": 2.202247191011236, "grad_norm": 2.6134696006774902, "kl": 0.15665952861309052, "learning_rate": 1.0019757514600667e-06, "loss": 0.0063, "reward": 1.495687484741211, "reward_std": 1.1083292961120605, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3394375145435333, "step": 4116 }, { "completion_length": 140.3125, "epoch": 2.2027822364901017, "grad_norm": 0.5940472483634949, "kl": 0.1495160609483719, "learning_rate": 1.0007299388646885e-06, "loss": 0.006, "reward": 2.2548751831054688, "reward_std": 0.7186115384101868, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4423750042915344, "step": 4117 }, { "completion_length": 128.875, "epoch": 2.2033172819689675, "grad_norm": 0.989734411239624, "kl": 0.1758221685886383, "learning_rate": 9.99484707415118e-07, "loss": 0.007, "reward": 2.307187557220459, "reward_std": 0.928050696849823, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4946874976158142, "step": 4118 }, { "completion_length": 131.5625, "epoch": 2.203852327447833, "grad_norm": 0.6463892459869385, "kl": 0.17084160447120667, "learning_rate": 9.982400575940263e-07, "loss": 0.0068, "reward": 1.9124374389648438, "reward_std": 0.5023441314697266, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4749374985694885, "step": 4119 }, { "completion_length": 139.28125, "epoch": 2.204387372926699, "grad_norm": 1.6467530727386475, "kl": 0.1764356791973114, "learning_rate": 9.969959898838671e-07, "loss": 0.0071, "reward": 2.3633124828338623, "reward_std": 1.043006420135498, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4570625126361847, "step": 4120 }, { "completion_length": 131.75, "epoch": 2.2049224184055642, "grad_norm": 0.679015040397644, "kl": 0.24701642990112305, "learning_rate": 9.95752504766864e-07, "loss": 0.0099, "reward": 1.615125060081482, "reward_std": 0.90533447265625, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41200000047683716, "step": 4121 }, { "completion_length": 128.34375, "epoch": 2.20545746388443, "grad_norm": 1.422930121421814, "kl": 0.17098423838615417, "learning_rate": 9.94509602725014e-07, "loss": 0.0068, "reward": 2.0335938930511475, "reward_std": 0.6282857656478882, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4554687440395355, "step": 4122 }, { "completion_length": 110.03125, "epoch": 2.2059925093632957, "grad_norm": 1.7113275527954102, "kl": 0.19765537977218628, "learning_rate": 9.932672842400923e-07, "loss": 0.0079, "reward": 2.5775625705718994, "reward_std": 0.5944216251373291, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49943751096725464, "step": 4123 }, { "completion_length": 145.21875, "epoch": 2.2065275548421615, "grad_norm": 1.0248668193817139, "kl": 0.1832120418548584, "learning_rate": 9.92025549793644e-07, "loss": 0.0073, "reward": 1.7132500410079956, "reward_std": 0.5249786376953125, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4476250112056732, "step": 4124 }, { "completion_length": 114.125, "epoch": 2.207062600321027, "grad_norm": 1.0941742658615112, "kl": 0.18236830830574036, "learning_rate": 9.907843998669894e-07, "loss": 0.0073, "reward": 2.7775001525878906, "reward_std": 0.4017157554626465, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4806250035762787, "step": 4125 }, { "completion_length": 160.125, "epoch": 2.207597645799893, "grad_norm": 2.1704962253570557, "kl": 0.2561439275741577, "learning_rate": 9.89543834941221e-07, "loss": 0.0102, "reward": 1.8782812356948853, "reward_std": 0.5351890921592712, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45640623569488525, "step": 4126 }, { "completion_length": 130.90625, "epoch": 2.2081326912787587, "grad_norm": 0.6606960296630859, "kl": 0.17765012383460999, "learning_rate": 9.883038554972077e-07, "loss": 0.0071, "reward": 2.2374062538146973, "reward_std": 0.8130899667739868, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.44053125381469727, "step": 4127 }, { "completion_length": 138.65625, "epoch": 2.2086677367576244, "grad_norm": 1.0788453817367554, "kl": 0.21002966165542603, "learning_rate": 9.870644620155878e-07, "loss": 0.0084, "reward": 2.0645625591278076, "reward_std": 0.7841805219650269, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47081249952316284, "step": 4128 }, { "completion_length": 118.375, "epoch": 2.20920278223649, "grad_norm": 2.0763676166534424, "kl": 0.22300241887569427, "learning_rate": 9.85825654976773e-07, "loss": 0.0089, "reward": 2.0468125343322754, "reward_std": 0.35389673709869385, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 4129 }, { "completion_length": 145.65625, "epoch": 2.209737827715356, "grad_norm": 0.7739852070808411, "kl": 0.163121297955513, "learning_rate": 9.845874348609514e-07, "loss": 0.0065, "reward": 1.6752500534057617, "reward_std": 0.8628238439559937, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47212499380111694, "step": 4130 }, { "completion_length": 121.0625, "epoch": 2.2102728731942216, "grad_norm": 0.8974207043647766, "kl": 0.2692740261554718, "learning_rate": 9.83349802148079e-07, "loss": 0.0108, "reward": 1.8502812385559082, "reward_std": 0.847541868686676, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4440312683582306, "step": 4131 }, { "completion_length": 137.375, "epoch": 2.2108079186730873, "grad_norm": 0.967795729637146, "kl": 0.2935766577720642, "learning_rate": 9.821127573178852e-07, "loss": 0.0117, "reward": 1.69140625, "reward_std": 0.7137049436569214, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 4132 }, { "completion_length": 110.625, "epoch": 2.211342964151953, "grad_norm": 1.9166077375411987, "kl": 0.3254605531692505, "learning_rate": 9.808763008498743e-07, "loss": 0.013, "reward": 2.196906328201294, "reward_std": 1.1644506454467773, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.47815626859664917, "step": 4133 }, { "completion_length": 137.78125, "epoch": 2.211878009630819, "grad_norm": 0.6129865050315857, "kl": 0.13020047545433044, "learning_rate": 9.796404332233197e-07, "loss": 0.0052, "reward": 1.7381561994552612, "reward_std": 0.8405874371528625, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.425656259059906, "step": 4134 }, { "completion_length": 146.15625, "epoch": 2.2124130551096846, "grad_norm": 1.321482539176941, "kl": 0.2128235250711441, "learning_rate": 9.784051549172663e-07, "loss": 0.0085, "reward": 2.0152812004089355, "reward_std": 0.7348021268844604, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4840312600135803, "step": 4135 }, { "completion_length": 152.0625, "epoch": 2.21294810058855, "grad_norm": 3.0116026401519775, "kl": 0.2599410116672516, "learning_rate": 9.771704664105335e-07, "loss": 0.0104, "reward": 1.909000039100647, "reward_std": 1.20870041847229, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4090000092983246, "step": 4136 }, { "completion_length": 117.71875, "epoch": 2.2134831460674156, "grad_norm": 1.5692349672317505, "kl": 0.26814115047454834, "learning_rate": 9.759363681817094e-07, "loss": 0.0107, "reward": 1.6570625305175781, "reward_std": 0.9147831797599792, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.42268747091293335, "step": 4137 }, { "completion_length": 135.5625, "epoch": 2.2140181915462813, "grad_norm": 0.8658961057662964, "kl": 0.15547125041484833, "learning_rate": 9.747028607091537e-07, "loss": 0.0062, "reward": 1.8353124856948853, "reward_std": 0.8434851169586182, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41343748569488525, "step": 4138 }, { "completion_length": 127.78125, "epoch": 2.214553237025147, "grad_norm": 0.88592529296875, "kl": 0.1532822996377945, "learning_rate": 9.734699444709974e-07, "loss": 0.0061, "reward": 2.057187557220459, "reward_std": 0.8670001029968262, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4478124976158142, "step": 4139 }, { "completion_length": 126.625, "epoch": 2.215088282504013, "grad_norm": 0.6062354445457458, "kl": 0.18426430225372314, "learning_rate": 9.722376199451437e-07, "loss": 0.0074, "reward": 2.3095624446868896, "reward_std": 0.5182982087135315, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4658125042915344, "step": 4140 }, { "completion_length": 134.59375, "epoch": 2.2156233279828785, "grad_norm": 1.1266117095947266, "kl": 0.191470667719841, "learning_rate": 9.710058876092642e-07, "loss": 0.0077, "reward": 1.9963124990463257, "reward_std": 0.7646960020065308, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4650624990463257, "step": 4141 }, { "completion_length": 126.78125, "epoch": 2.2161583734617443, "grad_norm": 0.47114238142967224, "kl": 0.15934514999389648, "learning_rate": 9.697747479408016e-07, "loss": 0.0064, "reward": 2.4496874809265137, "reward_std": 0.49209946393966675, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46531248092651367, "step": 4142 }, { "completion_length": 114.53125, "epoch": 2.21669341894061, "grad_norm": 0.6779097318649292, "kl": 0.1791231632232666, "learning_rate": 9.685442014169715e-07, "loss": 0.0072, "reward": 2.163281202316284, "reward_std": 0.6018934845924377, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49140626192092896, "step": 4143 }, { "completion_length": 126.625, "epoch": 2.2172284644194757, "grad_norm": 0.8409955501556396, "kl": 0.1824134737253189, "learning_rate": 9.673142485147538e-07, "loss": 0.0073, "reward": 2.0437188148498535, "reward_std": 1.0646791458129883, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44996875524520874, "step": 4144 }, { "completion_length": 138.78125, "epoch": 2.2177635098983415, "grad_norm": 4.274478435516357, "kl": 0.26404738426208496, "learning_rate": 9.660848897109047e-07, "loss": 0.0106, "reward": 1.262624979019165, "reward_std": 0.40343177318573, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.41887497901916504, "step": 4145 }, { "completion_length": 135.3125, "epoch": 2.218298555377207, "grad_norm": 897336.625, "kl": 4315.61279296875, "learning_rate": 9.648561254819457e-07, "loss": 172.6245, "reward": 1.7965624332427979, "reward_std": 0.6660240888595581, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4059374928474426, "step": 4146 }, { "completion_length": 118.96875, "epoch": 2.218833600856073, "grad_norm": 28.958404541015625, "kl": 0.45118358731269836, "learning_rate": 9.636279563041695e-07, "loss": 0.018, "reward": 2.7313125133514404, "reward_std": 0.660508394241333, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45006251335144043, "step": 4147 }, { "completion_length": 125.375, "epoch": 2.2193686463349387, "grad_norm": 3.362076997756958, "kl": 0.22703740000724792, "learning_rate": 9.624003826536388e-07, "loss": 0.0091, "reward": 2.1875, "reward_std": 0.757490873336792, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 4148 }, { "completion_length": 135.71875, "epoch": 2.219903691813804, "grad_norm": 0.5008209943771362, "kl": 0.16945017874240875, "learning_rate": 9.611734050061844e-07, "loss": 0.0068, "reward": 2.0179061889648438, "reward_std": 0.6569714546203613, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4710312485694885, "step": 4149 }, { "completion_length": 139.28125, "epoch": 2.2204387372926697, "grad_norm": 0.6328690648078918, "kl": 0.14118927717208862, "learning_rate": 9.599470238374053e-07, "loss": 0.0056, "reward": 2.2934062480926514, "reward_std": 0.8930593132972717, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43403124809265137, "step": 4150 }, { "completion_length": 124.625, "epoch": 2.2209737827715355, "grad_norm": 3.943782091140747, "kl": 0.4162469804286957, "learning_rate": 9.58721239622672e-07, "loss": 0.0166, "reward": 2.563218593597412, "reward_std": 0.7535111904144287, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48509377241134644, "step": 4151 }, { "completion_length": 150.03125, "epoch": 2.221508828250401, "grad_norm": 1.3690462112426758, "kl": 0.22868306934833527, "learning_rate": 9.574960528371214e-07, "loss": 0.0091, "reward": 1.7687187194824219, "reward_std": 0.7584931254386902, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.37809374928474426, "step": 4152 }, { "completion_length": 131.0, "epoch": 2.222043873729267, "grad_norm": 2.2822611331939697, "kl": 0.17330369353294373, "learning_rate": 9.562714639556586e-07, "loss": 0.0069, "reward": 2.458937406539917, "reward_std": 0.7576466798782349, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45893749594688416, "step": 4153 }, { "completion_length": 123.90625, "epoch": 2.2225789192081327, "grad_norm": 0.6656298637390137, "kl": 0.15863201022148132, "learning_rate": 9.550474734529578e-07, "loss": 0.0063, "reward": 2.1998438835144043, "reward_std": 0.8841109275817871, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48109376430511475, "step": 4154 }, { "completion_length": 140.8125, "epoch": 2.2231139646869984, "grad_norm": 1.3955565690994263, "kl": 0.18027132749557495, "learning_rate": 9.538240818034625e-07, "loss": 0.0072, "reward": 2.191093921661377, "reward_std": 0.7389665246009827, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4567187428474426, "step": 4155 }, { "completion_length": 136.4375, "epoch": 2.223649010165864, "grad_norm": 5.878931522369385, "kl": 0.4025076925754547, "learning_rate": 9.526012894813815e-07, "loss": 0.0161, "reward": 1.2864999771118164, "reward_std": 0.5841795802116394, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3802500069141388, "step": 4156 }, { "completion_length": 115.5625, "epoch": 2.22418405564473, "grad_norm": 1.060782551765442, "kl": 0.17778241634368896, "learning_rate": 9.513790969606926e-07, "loss": 0.0071, "reward": 2.473562479019165, "reward_std": 0.8164494037628174, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4891875088214874, "step": 4157 }, { "completion_length": 127.65625, "epoch": 2.2247191011235956, "grad_norm": 1.1676663160324097, "kl": 0.21899619698524475, "learning_rate": 9.501575047151426e-07, "loss": 0.0088, "reward": 1.6430625915527344, "reward_std": 1.0171878337860107, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4243125319480896, "step": 4158 }, { "completion_length": 131.40625, "epoch": 2.2252541466024613, "grad_norm": 0.6558420062065125, "kl": 0.17080536484718323, "learning_rate": 9.489365132182412e-07, "loss": 0.0068, "reward": 1.77134370803833, "reward_std": 0.8808650970458984, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45884376764297485, "step": 4159 }, { "completion_length": 114.375, "epoch": 2.225789192081327, "grad_norm": 1.0152603387832642, "kl": 0.17757976055145264, "learning_rate": 9.477161229432696e-07, "loss": 0.0071, "reward": 2.190718650817871, "reward_std": 0.8793278932571411, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48759377002716064, "step": 4160 }, { "completion_length": 120.84375, "epoch": 2.226324237560193, "grad_norm": 1.7311930656433105, "kl": 0.20847588777542114, "learning_rate": 9.464963343632766e-07, "loss": 0.0083, "reward": 1.6151249408721924, "reward_std": 0.6619977951049805, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45887500047683716, "step": 4161 }, { "completion_length": 121.71875, "epoch": 2.226859283039058, "grad_norm": 2.566145181655884, "kl": 0.3926162123680115, "learning_rate": 9.452771479510717e-07, "loss": 0.0157, "reward": 2.7407188415527344, "reward_std": 0.7431434988975525, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4907187521457672, "step": 4162 }, { "completion_length": 132.4375, "epoch": 2.227394328517924, "grad_norm": 1.8958100080490112, "kl": 0.16334420442581177, "learning_rate": 9.440585641792377e-07, "loss": 0.0065, "reward": 2.029250144958496, "reward_std": 1.0121731758117676, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43550002574920654, "step": 4163 }, { "completion_length": 121.15625, "epoch": 2.2279293739967896, "grad_norm": 0.9465434551239014, "kl": 0.19700637459754944, "learning_rate": 9.428405835201201e-07, "loss": 0.0079, "reward": 2.2198750972747803, "reward_std": 0.6197822690010071, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4386249780654907, "step": 4164 }, { "completion_length": 105.28125, "epoch": 2.2284644194756553, "grad_norm": 1.11802339553833, "kl": 0.2285808026790619, "learning_rate": 9.416232064458305e-07, "loss": 0.0091, "reward": 2.4919686317443848, "reward_std": 0.26388999819755554, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4919687509536743, "step": 4165 }, { "completion_length": 116.3125, "epoch": 2.228999464954521, "grad_norm": 3.4028260707855225, "kl": 0.2592679560184479, "learning_rate": 9.404064334282498e-07, "loss": 0.0104, "reward": 1.8995312452316284, "reward_std": 0.8790575265884399, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4776562452316284, "step": 4166 }, { "completion_length": 117.375, "epoch": 2.229534510433387, "grad_norm": 6.951387405395508, "kl": 0.42751309275627136, "learning_rate": 9.391902649390213e-07, "loss": 0.0171, "reward": 1.7462186813354492, "reward_std": 0.6993128061294556, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.449343740940094, "step": 4167 }, { "completion_length": 133.96875, "epoch": 2.2300695559122525, "grad_norm": 0.6267269849777222, "kl": 0.1781543642282486, "learning_rate": 9.379747014495544e-07, "loss": 0.0071, "reward": 2.1265311241149902, "reward_std": 0.9201005697250366, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4546562433242798, "step": 4168 }, { "completion_length": 142.8125, "epoch": 2.2306046013911183, "grad_norm": 3.5209617614746094, "kl": 0.18433310091495514, "learning_rate": 9.367597434310266e-07, "loss": 0.0074, "reward": 2.2517499923706055, "reward_std": 0.4368603825569153, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43924999237060547, "step": 4169 }, { "completion_length": 138.0, "epoch": 2.231139646869984, "grad_norm": 1.3575656414031982, "kl": 0.14987003803253174, "learning_rate": 9.35545391354378e-07, "loss": 0.006, "reward": 2.4468436241149902, "reward_std": 0.7219992876052856, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4780937433242798, "step": 4170 }, { "completion_length": 162.75, "epoch": 2.2316746923488497, "grad_norm": 1.4236559867858887, "kl": 0.21888552606105804, "learning_rate": 9.343316456903148e-07, "loss": 0.0088, "reward": 1.4292500019073486, "reward_std": 0.9393863677978516, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.38237500190734863, "step": 4171 }, { "completion_length": 132.6875, "epoch": 2.2322097378277155, "grad_norm": 1.0041979551315308, "kl": 0.1913388967514038, "learning_rate": 9.331185069093071e-07, "loss": 0.0077, "reward": 2.1446876525878906, "reward_std": 0.8266239166259766, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4728125035762787, "step": 4172 }, { "completion_length": 135.4375, "epoch": 2.232744783306581, "grad_norm": 1.4064911603927612, "kl": 0.21624699234962463, "learning_rate": 9.319059754815926e-07, "loss": 0.0086, "reward": 1.798875093460083, "reward_std": 0.8875856399536133, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.39262500405311584, "step": 4173 }, { "completion_length": 131.875, "epoch": 2.233279828785447, "grad_norm": 1.4176243543624878, "kl": 0.2167031466960907, "learning_rate": 9.30694051877171e-07, "loss": 0.0087, "reward": 1.6949999332427979, "reward_std": 0.5306625366210938, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4606249928474426, "step": 4174 }, { "completion_length": 152.875, "epoch": 2.2338148742643122, "grad_norm": 3.567392110824585, "kl": 0.18838131427764893, "learning_rate": 9.294827365658057e-07, "loss": 0.0075, "reward": 1.449625015258789, "reward_std": 0.9133574962615967, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4183749854564667, "step": 4175 }, { "completion_length": 130.5625, "epoch": 2.234349919743178, "grad_norm": 18.319753646850586, "kl": 0.4432132840156555, "learning_rate": 9.282720300170278e-07, "loss": 0.0177, "reward": 1.895187497138977, "reward_std": 0.8623454570770264, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.42643749713897705, "step": 4176 }, { "completion_length": 114.1875, "epoch": 2.2348849652220437, "grad_norm": 2.2472774982452393, "kl": 0.25670677423477173, "learning_rate": 9.270619327001295e-07, "loss": 0.0103, "reward": 2.2959060668945312, "reward_std": 0.6949079036712646, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4834062457084656, "step": 4177 }, { "completion_length": 118.5, "epoch": 2.2354200107009095, "grad_norm": 1.2661679983139038, "kl": 0.17996186017990112, "learning_rate": 9.258524450841669e-07, "loss": 0.0072, "reward": 2.4481873512268066, "reward_std": 0.7972003221511841, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4794375002384186, "step": 4178 }, { "completion_length": 115.5, "epoch": 2.235955056179775, "grad_norm": 0.704390823841095, "kl": 0.2317044734954834, "learning_rate": 9.246435676379622e-07, "loss": 0.0093, "reward": 2.2885000705718994, "reward_std": 0.8273105621337891, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49162501096725464, "step": 4179 }, { "completion_length": 134.15625, "epoch": 2.236490101658641, "grad_norm": 1.894319772720337, "kl": 0.16325795650482178, "learning_rate": 9.234353008300986e-07, "loss": 0.0065, "reward": 1.6418750286102295, "reward_std": 1.04558265209198, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4074999988079071, "step": 4180 }, { "completion_length": 137.59375, "epoch": 2.2370251471375067, "grad_norm": 1.3096555471420288, "kl": 0.15357564389705658, "learning_rate": 9.222276451289236e-07, "loss": 0.0061, "reward": 2.112468719482422, "reward_std": 0.9198657870292664, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47184374928474426, "step": 4181 }, { "completion_length": 121.40625, "epoch": 2.2375601926163724, "grad_norm": 1.289530634880066, "kl": 0.22582687437534332, "learning_rate": 9.210206010025466e-07, "loss": 0.009, "reward": 2.3901875019073486, "reward_std": 0.8841509222984314, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46831250190734863, "step": 4182 }, { "completion_length": 101.0, "epoch": 2.238095238095238, "grad_norm": 0.6832019090652466, "kl": 0.2236192524433136, "learning_rate": 9.19814168918843e-07, "loss": 0.0089, "reward": 3.078249931335449, "reward_std": 0.3636839985847473, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.484499990940094, "step": 4183 }, { "completion_length": 127.53125, "epoch": 2.238630283574104, "grad_norm": 0.686382532119751, "kl": 0.19974371790885925, "learning_rate": 9.186083493454481e-07, "loss": 0.008, "reward": 2.4685001373291016, "reward_std": 0.9106685519218445, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.499750018119812, "step": 4184 }, { "completion_length": 149.09375, "epoch": 2.2391653290529696, "grad_norm": 4.523770332336426, "kl": 0.26235008239746094, "learning_rate": 9.174031427497595e-07, "loss": 0.0105, "reward": 1.8413749933242798, "reward_std": 1.000908374786377, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4663749933242798, "step": 4185 }, { "completion_length": 129.65625, "epoch": 2.2397003745318353, "grad_norm": 0.8106535077095032, "kl": 0.17434215545654297, "learning_rate": 9.161985495989412e-07, "loss": 0.007, "reward": 2.11328125, "reward_std": 0.7763692140579224, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44140625, "step": 4186 }, { "completion_length": 127.375, "epoch": 2.240235420010701, "grad_norm": 0.8003013134002686, "kl": 0.22053970396518707, "learning_rate": 9.149945703599133e-07, "loss": 0.0088, "reward": 2.163468837738037, "reward_std": 0.8369985818862915, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46034371852874756, "step": 4187 }, { "completion_length": 122.90625, "epoch": 2.240770465489567, "grad_norm": 1.4239356517791748, "kl": 0.21147724986076355, "learning_rate": 9.137912054993628e-07, "loss": 0.0085, "reward": 2.0908751487731934, "reward_std": 0.8145641684532166, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4502500295639038, "step": 4188 }, { "completion_length": 112.53125, "epoch": 2.241305510968432, "grad_norm": 1.4168392419815063, "kl": 0.2116178572177887, "learning_rate": 9.125884554837386e-07, "loss": 0.0085, "reward": 3.07240629196167, "reward_std": 0.8486925363540649, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.44740623235702515, "step": 4189 }, { "completion_length": 121.03125, "epoch": 2.241840556447298, "grad_norm": 1.0777424573898315, "kl": 0.215006023645401, "learning_rate": 9.113863207792462e-07, "loss": 0.0086, "reward": 2.45521879196167, "reward_std": 0.9800761938095093, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45521873235702515, "step": 4190 }, { "completion_length": 124.71875, "epoch": 2.2423756019261636, "grad_norm": 0.8336305022239685, "kl": 0.18260134756565094, "learning_rate": 9.101848018518586e-07, "loss": 0.0073, "reward": 2.458750009536743, "reward_std": 1.001068353652954, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47437500953674316, "step": 4191 }, { "completion_length": 133.625, "epoch": 2.2429106474050293, "grad_norm": 1.2018966674804688, "kl": 0.2032637745141983, "learning_rate": 9.089838991673069e-07, "loss": 0.0081, "reward": 1.7553749084472656, "reward_std": 0.8789360523223877, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4428749978542328, "step": 4192 }, { "completion_length": 124.96875, "epoch": 2.243445692883895, "grad_norm": 1.3620595932006836, "kl": 0.27911829948425293, "learning_rate": 9.077836131910833e-07, "loss": 0.0112, "reward": 2.4928436279296875, "reward_std": 1.0830726623535156, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46159374713897705, "step": 4193 }, { "completion_length": 102.25, "epoch": 2.243980738362761, "grad_norm": 0.4720451235771179, "kl": 0.17211750149726868, "learning_rate": 9.065839443884431e-07, "loss": 0.0069, "reward": 3.008500099182129, "reward_std": 0.2619890570640564, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49287500977516174, "step": 4194 }, { "completion_length": 131.375, "epoch": 2.2445157838416265, "grad_norm": 1.2997952699661255, "kl": 0.1749456822872162, "learning_rate": 9.053848932243997e-07, "loss": 0.007, "reward": 1.7477500438690186, "reward_std": 0.7230024337768555, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4821249842643738, "step": 4195 }, { "completion_length": 127.96875, "epoch": 2.2450508293204923, "grad_norm": 1.009709119796753, "kl": 0.2369893193244934, "learning_rate": 9.04186460163729e-07, "loss": 0.0095, "reward": 2.368406295776367, "reward_std": 0.7941848635673523, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4777812361717224, "step": 4196 }, { "completion_length": 132.65625, "epoch": 2.245585874799358, "grad_norm": 0.45673587918281555, "kl": 0.17089997231960297, "learning_rate": 9.029886456709653e-07, "loss": 0.0068, "reward": 2.7614688873291016, "reward_std": 0.405398428440094, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.464593768119812, "step": 4197 }, { "completion_length": 121.75, "epoch": 2.2461209202782237, "grad_norm": 2.7593178749084473, "kl": 0.2550226151943207, "learning_rate": 9.017914502104064e-07, "loss": 0.0102, "reward": 1.9176561832427979, "reward_std": 0.8263033628463745, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4645312428474426, "step": 4198 }, { "completion_length": 146.0625, "epoch": 2.2466559657570895, "grad_norm": 0.9025117754936218, "kl": 0.24218150973320007, "learning_rate": 9.005948742461071e-07, "loss": 0.0097, "reward": 1.974562406539917, "reward_std": 0.5457722544670105, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42768749594688416, "step": 4199 }, { "completion_length": 133.9375, "epoch": 2.247191011235955, "grad_norm": 0.7688866257667542, "kl": 0.19754484295845032, "learning_rate": 8.993989182418824e-07, "loss": 0.0079, "reward": 1.828125, "reward_std": 0.33351296186447144, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 4200 }, { "completion_length": 111.28125, "epoch": 2.247726056714821, "grad_norm": 0.9303037524223328, "kl": 0.20261546969413757, "learning_rate": 8.982035826613098e-07, "loss": 0.0081, "reward": 1.9036250114440918, "reward_std": 0.830307126045227, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4817500114440918, "step": 4201 }, { "completion_length": 119.09375, "epoch": 2.2482611021936867, "grad_norm": 0.6187859177589417, "kl": 0.17903712391853333, "learning_rate": 8.97008867967723e-07, "loss": 0.0072, "reward": 2.67368745803833, "reward_std": 0.2937130630016327, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48618751764297485, "step": 4202 }, { "completion_length": 139.1875, "epoch": 2.248796147672552, "grad_norm": 1.263160228729248, "kl": 0.18418969213962555, "learning_rate": 8.958147746242157e-07, "loss": 0.0074, "reward": 2.387624979019165, "reward_std": 0.7507980465888977, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4970000088214874, "step": 4203 }, { "completion_length": 113.8125, "epoch": 2.2493311931514177, "grad_norm": 0.7127302289009094, "kl": 0.267203688621521, "learning_rate": 8.94621303093644e-07, "loss": 0.0107, "reward": 3.2307186126708984, "reward_std": 0.48429030179977417, "rewards/correctness_reward_func": 1.8125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.480718731880188, "step": 4204 }, { "completion_length": 129.125, "epoch": 2.2498662386302835, "grad_norm": 0.5496440529823303, "kl": 0.15658576786518097, "learning_rate": 8.934284538386165e-07, "loss": 0.0063, "reward": 2.374187469482422, "reward_std": 0.6382167935371399, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43668749928474426, "step": 4205 }, { "completion_length": 139.09375, "epoch": 2.250401284109149, "grad_norm": 1.1647460460662842, "kl": 0.1727679967880249, "learning_rate": 8.922362273215065e-07, "loss": 0.0069, "reward": 1.3672499656677246, "reward_std": 0.4684998691082001, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4610000252723694, "step": 4206 }, { "completion_length": 130.75, "epoch": 2.250936329588015, "grad_norm": 0.7189461588859558, "kl": 0.17574262619018555, "learning_rate": 8.910446240044454e-07, "loss": 0.007, "reward": 2.1155312061309814, "reward_std": 0.6907718181610107, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4905312657356262, "step": 4207 }, { "completion_length": 120.15625, "epoch": 2.2514713750668807, "grad_norm": 0.8151649832725525, "kl": 0.21384617686271667, "learning_rate": 8.898536443493183e-07, "loss": 0.0086, "reward": 1.4466562271118164, "reward_std": 0.6430436372756958, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4622812569141388, "step": 4208 }, { "completion_length": 119.84375, "epoch": 2.2520064205457464, "grad_norm": 0.45986345410346985, "kl": 0.19575448334217072, "learning_rate": 8.886632888177741e-07, "loss": 0.0078, "reward": 2.21875, "reward_std": 0.32512497901916504, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 4209 }, { "completion_length": 124.28125, "epoch": 2.252541466024612, "grad_norm": 1.7465550899505615, "kl": 0.18732786178588867, "learning_rate": 8.874735578712162e-07, "loss": 0.0075, "reward": 1.7427188158035278, "reward_std": 0.9983617663383484, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43021875619888306, "step": 4210 }, { "completion_length": 127.1875, "epoch": 2.253076511503478, "grad_norm": 0.7500036358833313, "kl": 0.15495619177818298, "learning_rate": 8.862844519708069e-07, "loss": 0.0062, "reward": 2.302093744277954, "reward_std": 0.791589617729187, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4583437442779541, "step": 4211 }, { "completion_length": 117.78125, "epoch": 2.2536115569823436, "grad_norm": 1.4124021530151367, "kl": 0.17153069376945496, "learning_rate": 8.850959715774679e-07, "loss": 0.0069, "reward": 1.792718768119812, "reward_std": 0.9901548624038696, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.464593768119812, "step": 4212 }, { "completion_length": 122.09375, "epoch": 2.2541466024612093, "grad_norm": 0.9085552096366882, "kl": 0.2579324245452881, "learning_rate": 8.839081171518762e-07, "loss": 0.0103, "reward": 2.2638750076293945, "reward_std": 1.1366841793060303, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46700000762939453, "step": 4213 }, { "completion_length": 127.9375, "epoch": 2.254681647940075, "grad_norm": 1.40972900390625, "kl": 0.18727333843708038, "learning_rate": 8.827208891544667e-07, "loss": 0.0075, "reward": 2.6166250705718994, "reward_std": 1.0249254703521729, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49162501096725464, "step": 4214 }, { "completion_length": 128.375, "epoch": 2.255216693418941, "grad_norm": 1.114972472190857, "kl": 0.23734024167060852, "learning_rate": 8.815342880454312e-07, "loss": 0.0095, "reward": 2.1013436317443848, "reward_std": 0.7232005596160889, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4607187509536743, "step": 4215 }, { "completion_length": 133.09375, "epoch": 2.255751738897806, "grad_norm": 1.0683090686798096, "kl": 0.18867890536785126, "learning_rate": 8.803483142847203e-07, "loss": 0.0075, "reward": 2.514500141143799, "reward_std": 0.6521050333976746, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4832499921321869, "step": 4216 }, { "completion_length": 117.4375, "epoch": 2.256286784376672, "grad_norm": 0.6262534260749817, "kl": 0.17480607330799103, "learning_rate": 8.791629683320394e-07, "loss": 0.007, "reward": 3.25390625, "reward_std": 0.5217167735099792, "rewards/correctness_reward_func": 1.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 4217 }, { "completion_length": 155.46875, "epoch": 2.2568218298555376, "grad_norm": 0.9704773426055908, "kl": 0.15912507474422455, "learning_rate": 8.77978250646851e-07, "loss": 0.0064, "reward": 1.0108749866485596, "reward_std": 0.5694820880889893, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.37024998664855957, "step": 4218 }, { "completion_length": 131.6875, "epoch": 2.2573568753344033, "grad_norm": 0.8672210574150085, "kl": 0.1690402626991272, "learning_rate": 8.767941616883754e-07, "loss": 0.0068, "reward": 2.1761250495910645, "reward_std": 0.9974169135093689, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.42612501978874207, "step": 4219 }, { "completion_length": 123.5, "epoch": 2.257891920813269, "grad_norm": 1.5142595767974854, "kl": 0.20598265528678894, "learning_rate": 8.756107019155877e-07, "loss": 0.0082, "reward": 2.33203125, "reward_std": 0.7202503681182861, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 4220 }, { "completion_length": 150.3125, "epoch": 2.258426966292135, "grad_norm": 0.7673019170761108, "kl": 0.178249791264534, "learning_rate": 8.744278717872185e-07, "loss": 0.0071, "reward": 1.6219375133514404, "reward_std": 1.0324571132659912, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.35631251335144043, "step": 4221 }, { "completion_length": 128.75, "epoch": 2.2589620117710005, "grad_norm": 1.310672640800476, "kl": 0.15929445624351501, "learning_rate": 8.732456717617574e-07, "loss": 0.0064, "reward": 2.1396560668945312, "reward_std": 0.9244015216827393, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4521562457084656, "step": 4222 }, { "completion_length": 101.5, "epoch": 2.2594970572498663, "grad_norm": 0.9546377062797546, "kl": 0.21346089243888855, "learning_rate": 8.720641022974466e-07, "loss": 0.0085, "reward": 2.205718994140625, "reward_std": 0.47396695613861084, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4869687557220459, "step": 4223 }, { "completion_length": 118.375, "epoch": 2.260032102728732, "grad_norm": 0.8687043786048889, "kl": 0.19787932932376862, "learning_rate": 8.708831638522855e-07, "loss": 0.0079, "reward": 2.701906204223633, "reward_std": 0.5901663303375244, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4831562638282776, "step": 4224 }, { "completion_length": 110.21875, "epoch": 2.2605671482075977, "grad_norm": 1.013145923614502, "kl": 0.2556116282939911, "learning_rate": 8.697028568840274e-07, "loss": 0.0102, "reward": 2.081531047821045, "reward_std": 0.8154599070549011, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47215625643730164, "step": 4225 }, { "completion_length": 102.34375, "epoch": 2.2611021936864635, "grad_norm": 1.1840275526046753, "kl": 0.3049461245536804, "learning_rate": 8.685231818501832e-07, "loss": 0.0122, "reward": 2.339062452316284, "reward_std": 0.9745151996612549, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44843751192092896, "step": 4226 }, { "completion_length": 151.34375, "epoch": 2.261637239165329, "grad_norm": 1.0439777374267578, "kl": 0.2268865704536438, "learning_rate": 8.673441392080173e-07, "loss": 0.0091, "reward": 1.8610937595367432, "reward_std": 0.921083927154541, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48609375953674316, "step": 4227 }, { "completion_length": 126.34375, "epoch": 2.262172284644195, "grad_norm": 49.30668640136719, "kl": 2.9673991203308105, "learning_rate": 8.661657294145485e-07, "loss": 0.1187, "reward": 2.2340312004089355, "reward_std": 0.7299951910972595, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4527812600135803, "step": 4228 }, { "completion_length": 137.84375, "epoch": 2.2627073301230602, "grad_norm": 1.3085753917694092, "kl": 0.231641948223114, "learning_rate": 8.649879529265526e-07, "loss": 0.0093, "reward": 1.4247187376022339, "reward_std": 0.5987830758094788, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4559687376022339, "step": 4229 }, { "completion_length": 145.65625, "epoch": 2.263242375601926, "grad_norm": 0.864072322845459, "kl": 0.15897813439369202, "learning_rate": 8.63810810200556e-07, "loss": 0.0064, "reward": 2.016124963760376, "reward_std": 0.5997549295425415, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45362499356269836, "step": 4230 }, { "completion_length": 129.75, "epoch": 2.2637774210807917, "grad_norm": 1.5876288414001465, "kl": 0.31853726506233215, "learning_rate": 8.626343016928429e-07, "loss": 0.0127, "reward": 1.703125, "reward_std": 0.99457186460495, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4375, "step": 4231 }, { "completion_length": 139.28125, "epoch": 2.2643124665596575, "grad_norm": 1.4181972742080688, "kl": 0.23753516376018524, "learning_rate": 8.614584278594524e-07, "loss": 0.0095, "reward": 1.4013750553131104, "reward_std": 0.7738407850265503, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43262502551078796, "step": 4232 }, { "completion_length": 158.78125, "epoch": 2.264847512038523, "grad_norm": 4.940239429473877, "kl": 0.5360076427459717, "learning_rate": 8.60283189156172e-07, "loss": 0.0214, "reward": 1.055281162261963, "reward_std": 0.730968713760376, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.33653125166893005, "step": 4233 }, { "completion_length": 119.09375, "epoch": 2.265382557517389, "grad_norm": 1.072503685951233, "kl": 0.1701306700706482, "learning_rate": 8.591085860385498e-07, "loss": 0.0068, "reward": 2.358468770980835, "reward_std": 0.23856914043426514, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46784377098083496, "step": 4234 }, { "completion_length": 134.03125, "epoch": 2.2659176029962547, "grad_norm": 0.7748817801475525, "kl": 0.22273264825344086, "learning_rate": 8.579346189618834e-07, "loss": 0.0089, "reward": 1.8240625858306885, "reward_std": 1.0595588684082031, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4334374964237213, "step": 4235 }, { "completion_length": 108.8125, "epoch": 2.2664526484751204, "grad_norm": 3.2658333778381348, "kl": 0.23463232815265656, "learning_rate": 8.567612883812243e-07, "loss": 0.0094, "reward": 2.2770938873291016, "reward_std": 0.927620530128479, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4645937383174896, "step": 4236 }, { "completion_length": 103.875, "epoch": 2.266987693953986, "grad_norm": 0.9481930136680603, "kl": 0.2121264934539795, "learning_rate": 8.555885947513792e-07, "loss": 0.0085, "reward": 2.1542811393737793, "reward_std": 0.7162657976150513, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46678125858306885, "step": 4237 }, { "completion_length": 104.875, "epoch": 2.267522739432852, "grad_norm": 1.8981267213821411, "kl": 0.2155085802078247, "learning_rate": 8.544165385269059e-07, "loss": 0.0086, "reward": 2.5067501068115234, "reward_std": 0.7586804628372192, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4911250174045563, "step": 4238 }, { "completion_length": 113.28125, "epoch": 2.2680577849117176, "grad_norm": 1.8102290630340576, "kl": 0.18521298468112946, "learning_rate": 8.532451201621156e-07, "loss": 0.0074, "reward": 2.8125, "reward_std": 0.8233535885810852, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46875, "step": 4239 }, { "completion_length": 113.15625, "epoch": 2.2685928303905833, "grad_norm": 1.9563379287719727, "kl": 0.24257205426692963, "learning_rate": 8.520743401110732e-07, "loss": 0.0097, "reward": 2.6419999599456787, "reward_std": 0.6594400405883789, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4545000195503235, "step": 4240 }, { "completion_length": 104.0, "epoch": 2.269127875869449, "grad_norm": 1.2168017625808716, "kl": 0.41079241037368774, "learning_rate": 8.509041988275954e-07, "loss": 0.0164, "reward": 2.6189687252044678, "reward_std": 1.1486068964004517, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47834375500679016, "step": 4241 }, { "completion_length": 129.75, "epoch": 2.2696629213483144, "grad_norm": 1.3750802278518677, "kl": 0.37290331721305847, "learning_rate": 8.497346967652509e-07, "loss": 0.0149, "reward": 2.1239376068115234, "reward_std": 0.7344678640365601, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4676874876022339, "step": 4242 }, { "completion_length": 123.8125, "epoch": 2.2701979668271806, "grad_norm": 1.8443939685821533, "kl": 0.22231443226337433, "learning_rate": 8.485658343773607e-07, "loss": 0.0089, "reward": 2.684875011444092, "reward_std": 0.8107039332389832, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4661249816417694, "step": 4243 }, { "completion_length": 129.46875, "epoch": 2.270733012306046, "grad_norm": 0.8919531106948853, "kl": 0.1657123565673828, "learning_rate": 8.473976121169997e-07, "loss": 0.0066, "reward": 2.259031295776367, "reward_std": 0.8096110820770264, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4465312361717224, "step": 4244 }, { "completion_length": 132.9375, "epoch": 2.2712680577849116, "grad_norm": 1.4537224769592285, "kl": 0.15982919931411743, "learning_rate": 8.462300304369922e-07, "loss": 0.0064, "reward": 2.634718894958496, "reward_std": 0.5878229141235352, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44721877574920654, "step": 4245 }, { "completion_length": 117.125, "epoch": 2.2718031032637773, "grad_norm": 0.860106348991394, "kl": 0.2509957551956177, "learning_rate": 8.450630897899148e-07, "loss": 0.01, "reward": 2.6324377059936523, "reward_std": 0.7373075485229492, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44493749737739563, "step": 4246 }, { "completion_length": 136.6875, "epoch": 2.272338148742643, "grad_norm": 0.7015721201896667, "kl": 0.14425978064537048, "learning_rate": 8.438967906280976e-07, "loss": 0.0058, "reward": 1.9564685821533203, "reward_std": 0.9437536597251892, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47209376096725464, "step": 4247 }, { "completion_length": 144.625, "epoch": 2.272873194221509, "grad_norm": 0.49276378750801086, "kl": 0.13231956958770752, "learning_rate": 8.427311334036195e-07, "loss": 0.0053, "reward": 2.0952811241149902, "reward_std": 0.9682343602180481, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4390312433242798, "step": 4248 }, { "completion_length": 144.6875, "epoch": 2.2734082397003745, "grad_norm": 1.2194023132324219, "kl": 0.15913788974285126, "learning_rate": 8.415661185683108e-07, "loss": 0.0064, "reward": 1.5600311756134033, "reward_std": 0.8896865248680115, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4350312352180481, "step": 4249 }, { "completion_length": 146.3125, "epoch": 2.2739432851792403, "grad_norm": 0.9459198117256165, "kl": 0.1929420828819275, "learning_rate": 8.404017465737555e-07, "loss": 0.0077, "reward": 2.00362491607666, "reward_std": 1.210574984550476, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.40987497568130493, "step": 4250 }, { "completion_length": 124.96875, "epoch": 2.274478330658106, "grad_norm": 1.7556403875350952, "kl": 0.20625263452529907, "learning_rate": 8.392380178712853e-07, "loss": 0.0083, "reward": 2.6764063835144043, "reward_std": 0.7800861597061157, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47328126430511475, "step": 4251 }, { "completion_length": 110.34375, "epoch": 2.2750133761369717, "grad_norm": 0.74220871925354, "kl": 0.15663208067417145, "learning_rate": 8.380749329119842e-07, "loss": 0.0063, "reward": 2.757937431335449, "reward_std": 1.095040202140808, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.461062490940094, "step": 4252 }, { "completion_length": 120.53125, "epoch": 2.2755484216158375, "grad_norm": 2.033399820327759, "kl": 0.226637065410614, "learning_rate": 8.369124921466848e-07, "loss": 0.0091, "reward": 2.4140625, "reward_std": 0.6072378158569336, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4921875, "step": 4253 }, { "completion_length": 122.71875, "epoch": 2.276083467094703, "grad_norm": 1.203147292137146, "kl": 0.20336748659610748, "learning_rate": 8.357506960259734e-07, "loss": 0.0081, "reward": 2.444187641143799, "reward_std": 0.4429359436035156, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4910624921321869, "step": 4254 }, { "completion_length": 125.25, "epoch": 2.2766185125735685, "grad_norm": 2.0397322177886963, "kl": 0.21472680568695068, "learning_rate": 8.345895450001837e-07, "loss": 0.0086, "reward": 2.628000020980835, "reward_std": 1.043805718421936, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44050002098083496, "step": 4255 }, { "completion_length": 102.78125, "epoch": 2.2771535580524347, "grad_norm": 0.5869008302688599, "kl": 0.2365533411502838, "learning_rate": 8.334290395193991e-07, "loss": 0.0095, "reward": 2.569906234741211, "reward_std": 0.6710084676742554, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47615623474121094, "step": 4256 }, { "completion_length": 136.0625, "epoch": 2.2776886035313, "grad_norm": 1.2840242385864258, "kl": 0.20514604449272156, "learning_rate": 8.322691800334562e-07, "loss": 0.0082, "reward": 2.0114998817443848, "reward_std": 0.8350412845611572, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4333750009536743, "step": 4257 }, { "completion_length": 125.375, "epoch": 2.2782236490101657, "grad_norm": 3.572598695755005, "kl": 0.19276614487171173, "learning_rate": 8.311099669919356e-07, "loss": 0.0077, "reward": 2.4188125133514404, "reward_std": 0.8778727650642395, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45006251335144043, "step": 4258 }, { "completion_length": 152.84375, "epoch": 2.2787586944890315, "grad_norm": 2.0694668292999268, "kl": 0.174050435423851, "learning_rate": 8.29951400844173e-07, "loss": 0.007, "reward": 1.7117500305175781, "reward_std": 0.7472203969955444, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.44612500071525574, "step": 4259 }, { "completion_length": 139.375, "epoch": 2.279293739967897, "grad_norm": 0.6607476472854614, "kl": 0.14912118017673492, "learning_rate": 8.287934820392499e-07, "loss": 0.006, "reward": 2.5219063758850098, "reward_std": 0.7558858394622803, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4594062566757202, "step": 4260 }, { "completion_length": 110.28125, "epoch": 2.279828785446763, "grad_norm": 2.0279831886291504, "kl": 0.394706666469574, "learning_rate": 8.276362110259969e-07, "loss": 0.0158, "reward": 2.616468906402588, "reward_std": 1.1229467391967773, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47584372758865356, "step": 4261 }, { "completion_length": 116.1875, "epoch": 2.2803638309256287, "grad_norm": 0.8950337767601013, "kl": 0.2532932758331299, "learning_rate": 8.264795882529964e-07, "loss": 0.0101, "reward": 2.7755937576293945, "reward_std": 0.8210105895996094, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47871875762939453, "step": 4262 }, { "completion_length": 148.40625, "epoch": 2.2808988764044944, "grad_norm": 2.7808191776275635, "kl": 0.3389130234718323, "learning_rate": 8.253236141685764e-07, "loss": 0.0136, "reward": 1.4054999351501465, "reward_std": 0.7443667650222778, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43674999475479126, "step": 4263 }, { "completion_length": 127.0, "epoch": 2.28143392188336, "grad_norm": 0.4104469418525696, "kl": 0.17441165447235107, "learning_rate": 8.241682892208139e-07, "loss": 0.007, "reward": 2.11328125, "reward_std": 0.4184677004814148, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 4264 }, { "completion_length": 134.375, "epoch": 2.281968967362226, "grad_norm": 5.9869585037231445, "kl": 0.6109585762023926, "learning_rate": 8.230136138575367e-07, "loss": 0.0244, "reward": 1.7427186965942383, "reward_std": 0.47081679105758667, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46146875619888306, "step": 4265 }, { "completion_length": 126.25, "epoch": 2.2825040128410916, "grad_norm": 0.7246792316436768, "kl": 0.24006615579128265, "learning_rate": 8.218595885263181e-07, "loss": 0.0096, "reward": 2.296875, "reward_std": 0.5735911726951599, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46875, "step": 4266 }, { "completion_length": 142.71875, "epoch": 2.2830390583199573, "grad_norm": 1.1103159189224243, "kl": 0.15715892612934113, "learning_rate": 8.207062136744806e-07, "loss": 0.0063, "reward": 2.3324999809265137, "reward_std": 0.6759911179542542, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44187501072883606, "step": 4267 }, { "completion_length": 134.4375, "epoch": 2.283574103798823, "grad_norm": 0.695755660533905, "kl": 0.20211739838123322, "learning_rate": 8.195534897490936e-07, "loss": 0.0081, "reward": 2.212125062942505, "reward_std": 1.1234303712844849, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4621250033378601, "step": 4268 }, { "completion_length": 119.875, "epoch": 2.284109149277689, "grad_norm": 1.0653200149536133, "kl": 0.18787413835525513, "learning_rate": 8.184014171969765e-07, "loss": 0.0075, "reward": 2.7687811851501465, "reward_std": 0.5318870544433594, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48753124475479126, "step": 4269 }, { "completion_length": 151.8125, "epoch": 2.284644194756554, "grad_norm": 1.7973204851150513, "kl": 0.17589551210403442, "learning_rate": 8.172499964646938e-07, "loss": 0.007, "reward": 2.0801563262939453, "reward_std": 0.9811573028564453, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.39265626668930054, "step": 4270 }, { "completion_length": 131.65625, "epoch": 2.28517924023542, "grad_norm": 0.7276715636253357, "kl": 0.2084893137216568, "learning_rate": 8.160992279985578e-07, "loss": 0.0083, "reward": 2.047781229019165, "reward_std": 0.8486930727958679, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4696562588214874, "step": 4271 }, { "completion_length": 128.875, "epoch": 2.2857142857142856, "grad_norm": 1.6120976209640503, "kl": 0.16029168665409088, "learning_rate": 8.149491122446304e-07, "loss": 0.0064, "reward": 2.4565000534057617, "reward_std": 0.48672935366630554, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47212499380111694, "step": 4272 }, { "completion_length": 120.5625, "epoch": 2.2862493311931513, "grad_norm": 0.7199530005455017, "kl": 0.20290839672088623, "learning_rate": 8.137996496487155e-07, "loss": 0.0081, "reward": 2.7080936431884766, "reward_std": 0.8264092803001404, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4737187623977661, "step": 4273 }, { "completion_length": 143.34375, "epoch": 2.286784376672017, "grad_norm": 1.3100028038024902, "kl": 0.22310465574264526, "learning_rate": 8.126508406563682e-07, "loss": 0.0089, "reward": 2.2144999504089355, "reward_std": 0.6909470558166504, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4645000100135803, "step": 4274 }, { "completion_length": 116.40625, "epoch": 2.287319422150883, "grad_norm": 3.01509690284729, "kl": 0.16210830211639404, "learning_rate": 8.115026857128911e-07, "loss": 0.0065, "reward": 2.1139373779296875, "reward_std": 0.6445748805999756, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47331249713897705, "step": 4275 }, { "completion_length": 140.15625, "epoch": 2.2878544676297485, "grad_norm": 9.125762939453125, "kl": 1.9863337278366089, "learning_rate": 8.10355185263327e-07, "loss": 0.0795, "reward": 1.7824687957763672, "reward_std": 0.65141761302948, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4543437361717224, "step": 4276 }, { "completion_length": 104.0, "epoch": 2.2883895131086143, "grad_norm": 22.101993560791016, "kl": 0.6489449739456177, "learning_rate": 8.092083397524713e-07, "loss": 0.026, "reward": 1.5888125896453857, "reward_std": 0.5307046175003052, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4638125002384186, "step": 4277 }, { "completion_length": 151.5625, "epoch": 2.28892455858748, "grad_norm": 2.3777668476104736, "kl": 0.1863381564617157, "learning_rate": 8.080621496248647e-07, "loss": 0.0075, "reward": 2.1613125801086426, "reward_std": 1.0749576091766357, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4581874907016754, "step": 4278 }, { "completion_length": 142.4375, "epoch": 2.2894596040663457, "grad_norm": 1.250968337059021, "kl": 0.2178456336259842, "learning_rate": 8.069166153247893e-07, "loss": 0.0087, "reward": 1.5461249351501465, "reward_std": 0.44223588705062866, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.42112499475479126, "step": 4279 }, { "completion_length": 130.125, "epoch": 2.2899946495452115, "grad_norm": 3.6827239990234375, "kl": 0.2079298496246338, "learning_rate": 8.057717372962784e-07, "loss": 0.0083, "reward": 1.7730937004089355, "reward_std": 0.8593184351921082, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4605937600135803, "step": 4280 }, { "completion_length": 145.78125, "epoch": 2.290529695024077, "grad_norm": 2.427765130996704, "kl": 0.6289792060852051, "learning_rate": 8.046275159831079e-07, "loss": 0.0252, "reward": 2.019218921661377, "reward_std": 0.9474124908447266, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3942187428474426, "step": 4281 }, { "completion_length": 117.03125, "epoch": 2.291064740502943, "grad_norm": 1.0030030012130737, "kl": 0.15701207518577576, "learning_rate": 8.034839518287993e-07, "loss": 0.0063, "reward": 2.9428749084472656, "reward_std": 0.5551570057868958, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4741249978542328, "step": 4282 }, { "completion_length": 135.59375, "epoch": 2.2915997859818082, "grad_norm": 0.592881977558136, "kl": 0.16829384863376617, "learning_rate": 8.023410452766214e-07, "loss": 0.0067, "reward": 1.9565937519073486, "reward_std": 0.8248000741004944, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45659375190734863, "step": 4283 }, { "completion_length": 132.65625, "epoch": 2.292134831460674, "grad_norm": 0.6136245727539062, "kl": 0.15769805014133453, "learning_rate": 8.011987967695861e-07, "loss": 0.0063, "reward": 2.659156322479248, "reward_std": 0.5204318165779114, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48728126287460327, "step": 4284 }, { "completion_length": 147.125, "epoch": 2.2926698769395397, "grad_norm": 1.4071509838104248, "kl": 0.16384002566337585, "learning_rate": 8.000572067504506e-07, "loss": 0.0066, "reward": 1.9143750667572021, "reward_std": 1.1199207305908203, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4143750071525574, "step": 4285 }, { "completion_length": 135.5, "epoch": 2.2932049224184055, "grad_norm": 0.5443505644798279, "kl": 0.1838180124759674, "learning_rate": 7.989162756617161e-07, "loss": 0.0074, "reward": 2.7228126525878906, "reward_std": 0.6015003323554993, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4884375035762787, "step": 4286 }, { "completion_length": 121.1875, "epoch": 2.293739967897271, "grad_norm": 0.7718814015388489, "kl": 0.19276973605155945, "learning_rate": 7.977760039456314e-07, "loss": 0.0077, "reward": 2.0683751106262207, "reward_std": 0.44006049633026123, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49024999141693115, "step": 4287 }, { "completion_length": 142.4375, "epoch": 2.294275013376137, "grad_norm": 5.80596399307251, "kl": 0.38227465748786926, "learning_rate": 7.966363920441863e-07, "loss": 0.0153, "reward": 1.7810311317443848, "reward_std": 0.9801461696624756, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4372812509536743, "step": 4288 }, { "completion_length": 141.3125, "epoch": 2.2948100588550027, "grad_norm": 1.21503484249115, "kl": 0.20240925252437592, "learning_rate": 7.954974403991162e-07, "loss": 0.0081, "reward": 2.103250026702881, "reward_std": 0.8517539501190186, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.40012499690055847, "step": 4289 }, { "completion_length": 130.65625, "epoch": 2.2953451043338684, "grad_norm": 0.9521206021308899, "kl": 0.19919723272323608, "learning_rate": 7.943591494519015e-07, "loss": 0.008, "reward": 1.7569687366485596, "reward_std": 0.9596174955368042, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46009373664855957, "step": 4290 }, { "completion_length": 126.40625, "epoch": 2.295880149812734, "grad_norm": 1.8652020692825317, "kl": 0.2632322311401367, "learning_rate": 7.932215196437648e-07, "loss": 0.0105, "reward": 2.5633749961853027, "reward_std": 1.0593656301498413, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48524999618530273, "step": 4291 }, { "completion_length": 136.875, "epoch": 2.2964151952916, "grad_norm": 1.2208120822906494, "kl": 0.3385368287563324, "learning_rate": 7.92084551415673e-07, "loss": 0.0135, "reward": 1.5514687299728394, "reward_std": 0.7921051979064941, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44209372997283936, "step": 4292 }, { "completion_length": 120.0625, "epoch": 2.2969502407704656, "grad_norm": 1.0716345310211182, "kl": 0.1589379608631134, "learning_rate": 7.909482452083378e-07, "loss": 0.0064, "reward": 2.8982186317443848, "reward_std": 0.9199909567832947, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4763437509536743, "step": 4293 }, { "completion_length": 141.53125, "epoch": 2.2974852862493313, "grad_norm": 1.0211318731307983, "kl": 0.15590253472328186, "learning_rate": 7.898126014622127e-07, "loss": 0.0062, "reward": 2.2118749618530273, "reward_std": 1.1704151630401611, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43062499165534973, "step": 4294 }, { "completion_length": 131.53125, "epoch": 2.298020331728197, "grad_norm": 1.7223684787750244, "kl": 0.14609172940254211, "learning_rate": 7.886776206174948e-07, "loss": 0.0058, "reward": 1.624406337738037, "reward_std": 0.578580379486084, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.42128124833106995, "step": 4295 }, { "completion_length": 137.8125, "epoch": 2.2985553772070624, "grad_norm": 2.70397686958313, "kl": 0.17875435948371887, "learning_rate": 7.875433031141239e-07, "loss": 0.0072, "reward": 2.160031318664551, "reward_std": 0.5691888928413391, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 4296 }, { "completion_length": 121.25, "epoch": 2.299090422685928, "grad_norm": 0.7708066701889038, "kl": 0.18345513939857483, "learning_rate": 7.864096493917847e-07, "loss": 0.0073, "reward": 3.1297812461853027, "reward_std": 0.6246122121810913, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48915624618530273, "step": 4297 }, { "completion_length": 147.03125, "epoch": 2.299625468164794, "grad_norm": 2.9220046997070312, "kl": 0.36306583881378174, "learning_rate": 7.852766598899023e-07, "loss": 0.0145, "reward": 1.5588123798370361, "reward_std": 0.9845117330551147, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4025624990463257, "step": 4298 }, { "completion_length": 141.84375, "epoch": 2.3001605136436596, "grad_norm": 1.345081090927124, "kl": 0.17162275314331055, "learning_rate": 7.84144335047644e-07, "loss": 0.0069, "reward": 1.609375, "reward_std": 0.8374333381652832, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.421875, "step": 4299 }, { "completion_length": 156.3125, "epoch": 2.3006955591225253, "grad_norm": 2.8308310508728027, "kl": 0.13637122511863708, "learning_rate": 7.830126753039236e-07, "loss": 0.0055, "reward": 1.8518750667572021, "reward_std": 1.2486252784729004, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4143750071525574, "step": 4300 }, { "completion_length": 144.15625, "epoch": 2.301230604601391, "grad_norm": 1.733865737915039, "kl": 0.15405280888080597, "learning_rate": 7.818816810973903e-07, "loss": 0.0062, "reward": 2.568406343460083, "reward_std": 0.7424933910369873, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45903125405311584, "step": 4301 }, { "completion_length": 146.25, "epoch": 2.301765650080257, "grad_norm": 1.2731897830963135, "kl": 0.15871867537498474, "learning_rate": 7.807513528664415e-07, "loss": 0.0063, "reward": 1.6814062595367432, "reward_std": 0.9698538780212402, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41578125953674316, "step": 4302 }, { "completion_length": 144.75, "epoch": 2.3023006955591225, "grad_norm": 0.7536675930023193, "kl": 0.18322807550430298, "learning_rate": 7.796216910492149e-07, "loss": 0.0073, "reward": 1.3136249780654907, "reward_std": 0.9070194959640503, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3761249780654907, "step": 4303 }, { "completion_length": 136.375, "epoch": 2.3028357410379883, "grad_norm": 0.8624172806739807, "kl": 0.2481328696012497, "learning_rate": 7.784926960835862e-07, "loss": 0.0099, "reward": 1.9091250896453857, "reward_std": 1.0168342590332031, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4091250002384186, "step": 4304 }, { "completion_length": 104.90625, "epoch": 2.303370786516854, "grad_norm": 1.5641006231307983, "kl": 0.24486617743968964, "learning_rate": 7.77364368407178e-07, "loss": 0.0098, "reward": 2.366374969482422, "reward_std": 0.29051443934440613, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49137499928474426, "step": 4305 }, { "completion_length": 112.0, "epoch": 2.3039058319957197, "grad_norm": 1.227706789970398, "kl": 0.2581295073032379, "learning_rate": 7.762367084573508e-07, "loss": 0.0103, "reward": 2.147031307220459, "reward_std": 0.6343823671340942, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4751562476158142, "step": 4306 }, { "completion_length": 139.96875, "epoch": 2.3044408774745855, "grad_norm": 0.996004045009613, "kl": 0.19621118903160095, "learning_rate": 7.751097166712066e-07, "loss": 0.0078, "reward": 1.611375093460083, "reward_std": 0.8094197511672974, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42387497425079346, "step": 4307 }, { "completion_length": 136.875, "epoch": 2.304975922953451, "grad_norm": 1.0340311527252197, "kl": 0.17151367664337158, "learning_rate": 7.7398339348559e-07, "loss": 0.0069, "reward": 2.1017189025878906, "reward_std": 0.8053760528564453, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4142187535762787, "step": 4308 }, { "completion_length": 139.03125, "epoch": 2.3055109684323165, "grad_norm": 1.005553126335144, "kl": 0.21027547121047974, "learning_rate": 7.728577393370854e-07, "loss": 0.0084, "reward": 1.8209375143051147, "reward_std": 0.7888379096984863, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44593751430511475, "step": 4309 }, { "completion_length": 127.125, "epoch": 2.3060460139111827, "grad_norm": 0.678145706653595, "kl": 0.16471782326698303, "learning_rate": 7.717327546620166e-07, "loss": 0.0066, "reward": 1.7367812395095825, "reward_std": 0.7574442625045776, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4711562395095825, "step": 4310 }, { "completion_length": 164.75, "epoch": 2.306581059390048, "grad_norm": 1.236372947692871, "kl": 0.15374571084976196, "learning_rate": 7.706084398964512e-07, "loss": 0.0061, "reward": 1.4638125896453857, "reward_std": 0.9546927213668823, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3700625002384186, "step": 4311 }, { "completion_length": 140.0625, "epoch": 2.3071161048689137, "grad_norm": 0.987688422203064, "kl": 0.22527754306793213, "learning_rate": 7.694847954761939e-07, "loss": 0.009, "reward": 2.16015625, "reward_std": 0.8377547264099121, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 4312 }, { "completion_length": 139.15625, "epoch": 2.3076511503477795, "grad_norm": 0.9655929207801819, "kl": 0.2357361614704132, "learning_rate": 7.683618218367911e-07, "loss": 0.0094, "reward": 1.2189375162124634, "reward_std": 0.37124037742614746, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4533125162124634, "step": 4313 }, { "completion_length": 147.09375, "epoch": 2.308186195826645, "grad_norm": 1.3679264783859253, "kl": 0.17097198963165283, "learning_rate": 7.672395194135282e-07, "loss": 0.0068, "reward": 2.2260937690734863, "reward_std": 1.3251447677612305, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.42921873927116394, "step": 4314 }, { "completion_length": 123.4375, "epoch": 2.308721241305511, "grad_norm": 2.149019241333008, "kl": 0.24945789575576782, "learning_rate": 7.661178886414328e-07, "loss": 0.01, "reward": 1.8427813053131104, "reward_std": 0.9859533309936523, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4677812457084656, "step": 4315 }, { "completion_length": 150.59375, "epoch": 2.3092562867843767, "grad_norm": 2.115771532058716, "kl": 0.2392423450946808, "learning_rate": 7.64996929955269e-07, "loss": 0.0096, "reward": 1.429937481880188, "reward_std": 0.5810599327087402, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.445562481880188, "step": 4316 }, { "completion_length": 125.53125, "epoch": 2.3097913322632424, "grad_norm": 0.8412816524505615, "kl": 0.167097270488739, "learning_rate": 7.63876643789542e-07, "loss": 0.0067, "reward": 2.196906328201294, "reward_std": 1.0025559663772583, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49378126859664917, "step": 4317 }, { "completion_length": 122.09375, "epoch": 2.310326377742108, "grad_norm": 1.278335690498352, "kl": 0.1550087332725525, "learning_rate": 7.62757030578497e-07, "loss": 0.0062, "reward": 2.70703125, "reward_std": 0.865631103515625, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 4318 }, { "completion_length": 128.8125, "epoch": 2.310861423220974, "grad_norm": 0.9248124361038208, "kl": 0.16804654896259308, "learning_rate": 7.616380907561171e-07, "loss": 0.0067, "reward": 2.611781120300293, "reward_std": 1.0463111400604248, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4711562395095825, "step": 4319 }, { "completion_length": 135.34375, "epoch": 2.3113964686998396, "grad_norm": 1.1808756589889526, "kl": 0.15786729753017426, "learning_rate": 7.605198247561241e-07, "loss": 0.0063, "reward": 2.166874885559082, "reward_std": 0.7750226855278015, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4793750047683716, "step": 4320 }, { "completion_length": 121.53125, "epoch": 2.3119315141787053, "grad_norm": 0.8870487809181213, "kl": 0.20587614178657532, "learning_rate": 7.594022330119815e-07, "loss": 0.0082, "reward": 1.4067811965942383, "reward_std": 0.4942282438278198, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46928122639656067, "step": 4321 }, { "completion_length": 118.3125, "epoch": 2.312466559657571, "grad_norm": 0.8455467820167542, "kl": 0.19404636323451996, "learning_rate": 7.582853159568864e-07, "loss": 0.0078, "reward": 1.9679999351501465, "reward_std": 0.772969663143158, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49924999475479126, "step": 4322 }, { "completion_length": 122.4375, "epoch": 2.313001605136437, "grad_norm": 1.0077769756317139, "kl": 0.20182207226753235, "learning_rate": 7.571690740237792e-07, "loss": 0.0081, "reward": 2.163968563079834, "reward_std": 0.9209381341934204, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4764687418937683, "step": 4323 }, { "completion_length": 113.8125, "epoch": 2.313536650615302, "grad_norm": 2.1615982055664062, "kl": 0.2820151448249817, "learning_rate": 7.560535076453363e-07, "loss": 0.0113, "reward": 2.676281213760376, "reward_std": 0.5815967917442322, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48878124356269836, "step": 4324 }, { "completion_length": 130.9375, "epoch": 2.314071696094168, "grad_norm": 1.2373017072677612, "kl": 0.19761866331100464, "learning_rate": 7.549386172539716e-07, "loss": 0.0079, "reward": 1.9522500038146973, "reward_std": 1.043859601020813, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43662500381469727, "step": 4325 }, { "completion_length": 130.15625, "epoch": 2.3146067415730336, "grad_norm": 43.63889694213867, "kl": 1.1856954097747803, "learning_rate": 7.538244032818393e-07, "loss": 0.0474, "reward": 1.8765312433242798, "reward_std": 0.8681915998458862, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4390312433242798, "step": 4326 }, { "completion_length": 105.96875, "epoch": 2.3151417870518993, "grad_norm": 1.9954532384872437, "kl": 0.3822976350784302, "learning_rate": 7.527108661608298e-07, "loss": 0.0153, "reward": 2.7145938873291016, "reward_std": 1.0223722457885742, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.480218768119812, "step": 4327 }, { "completion_length": 147.0625, "epoch": 2.315676832530765, "grad_norm": 1.6599246263504028, "kl": 0.19425874948501587, "learning_rate": 7.515980063225711e-07, "loss": 0.0078, "reward": 1.749343752861023, "reward_std": 0.6380414366722107, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.35871875286102295, "step": 4328 }, { "completion_length": 107.40625, "epoch": 2.316211878009631, "grad_norm": 0.8745077252388, "kl": 0.19430838525295258, "learning_rate": 7.504858241984287e-07, "loss": 0.0078, "reward": 2.4528751373291016, "reward_std": 0.5283876061439514, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499750018119812, "step": 4329 }, { "completion_length": 140.40625, "epoch": 2.3167469234884965, "grad_norm": 1.2438899278640747, "kl": 0.2645862102508545, "learning_rate": 7.493743202195064e-07, "loss": 0.0106, "reward": 1.4699063301086426, "reward_std": 0.6297311782836914, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4699062407016754, "step": 4330 }, { "completion_length": 120.375, "epoch": 2.3172819689673623, "grad_norm": 1.2120702266693115, "kl": 0.18134921789169312, "learning_rate": 7.482634948166442e-07, "loss": 0.0073, "reward": 2.34375, "reward_std": 0.49121132493019104, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 4331 }, { "completion_length": 124.46875, "epoch": 2.317817014446228, "grad_norm": 0.877578616142273, "kl": 0.15160638093948364, "learning_rate": 7.471533484204189e-07, "loss": 0.0061, "reward": 1.8199687004089355, "reward_std": 0.6331830024719238, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46059373021125793, "step": 4332 }, { "completion_length": 134.21875, "epoch": 2.3183520599250937, "grad_norm": 1.4286775588989258, "kl": 0.175519660115242, "learning_rate": 7.460438814611451e-07, "loss": 0.007, "reward": 1.8709688186645508, "reward_std": 0.49315062165260315, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4490937292575836, "step": 4333 }, { "completion_length": 120.5, "epoch": 2.3188871054039595, "grad_norm": 1.3855528831481934, "kl": 0.2056787610054016, "learning_rate": 7.449350943688735e-07, "loss": 0.0082, "reward": 2.4090938568115234, "reward_std": 0.8535758852958679, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4715937376022339, "step": 4334 }, { "completion_length": 135.65625, "epoch": 2.319422150882825, "grad_norm": 3.073577404022217, "kl": 0.3009577989578247, "learning_rate": 7.438269875733903e-07, "loss": 0.012, "reward": 1.6866562366485596, "reward_std": 0.8376275300979614, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43665623664855957, "step": 4335 }, { "completion_length": 125.34375, "epoch": 2.319957196361691, "grad_norm": 0.8237230181694031, "kl": 0.15960314869880676, "learning_rate": 7.427195615042201e-07, "loss": 0.0064, "reward": 2.7070937156677246, "reward_std": 0.8282707333564758, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.472718745470047, "step": 4336 }, { "completion_length": 104.15625, "epoch": 2.3204922418405562, "grad_norm": 1.086273193359375, "kl": 0.3810378611087799, "learning_rate": 7.416128165906222e-07, "loss": 0.0152, "reward": 2.1349687576293945, "reward_std": 0.8253083229064941, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46309375762939453, "step": 4337 }, { "completion_length": 151.46875, "epoch": 2.321027287319422, "grad_norm": 1.1521997451782227, "kl": 0.21898794174194336, "learning_rate": 7.405067532615917e-07, "loss": 0.0088, "reward": 1.8565938472747803, "reward_std": 1.0552797317504883, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4347187280654907, "step": 4338 }, { "completion_length": 127.53125, "epoch": 2.3215623327982877, "grad_norm": 1.1146882772445679, "kl": 0.18729302287101746, "learning_rate": 7.394013719458592e-07, "loss": 0.0075, "reward": 1.9136250019073486, "reward_std": 0.7133871912956238, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49175000190734863, "step": 4339 }, { "completion_length": 110.46875, "epoch": 2.3220973782771535, "grad_norm": 7.344454765319824, "kl": 0.16297100484371185, "learning_rate": 7.382966730718938e-07, "loss": 0.0065, "reward": 2.90234375, "reward_std": 0.726251482963562, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.48046875, "step": 4340 }, { "completion_length": 142.46875, "epoch": 2.322632423756019, "grad_norm": 2.6654529571533203, "kl": 0.15930593013763428, "learning_rate": 7.371926570678964e-07, "loss": 0.0064, "reward": 1.3282500505447388, "reward_std": 0.8896290063858032, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4220000207424164, "step": 4341 }, { "completion_length": 111.90625, "epoch": 2.323167469234885, "grad_norm": 1.313991904258728, "kl": 0.19208788871765137, "learning_rate": 7.360893243618045e-07, "loss": 0.0077, "reward": 2.789750099182129, "reward_std": 0.7635023593902588, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47725000977516174, "step": 4342 }, { "completion_length": 101.40625, "epoch": 2.3237025147137507, "grad_norm": 0.8215247988700867, "kl": 0.21175292134284973, "learning_rate": 7.349866753812932e-07, "loss": 0.0085, "reward": 2.920656204223633, "reward_std": 0.8186960220336914, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4831562638282776, "step": 4343 }, { "completion_length": 148.65625, "epoch": 2.3242375601926164, "grad_norm": 0.7768461108207703, "kl": 0.18076342344284058, "learning_rate": 7.338847105537672e-07, "loss": 0.0072, "reward": 2.1665313243865967, "reward_std": 0.9203556776046753, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4321562349796295, "step": 4344 }, { "completion_length": 134.5, "epoch": 2.324772605671482, "grad_norm": 0.8808384537696838, "kl": 0.22210311889648438, "learning_rate": 7.327834303063708e-07, "loss": 0.0089, "reward": 2.1002187728881836, "reward_std": 1.0547401905059814, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4595937430858612, "step": 4345 }, { "completion_length": 136.15625, "epoch": 2.325307651150348, "grad_norm": 0.7702898383140564, "kl": 0.1836632490158081, "learning_rate": 7.316828350659827e-07, "loss": 0.0073, "reward": 2.206906318664551, "reward_std": 0.7089928388595581, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.456906259059906, "step": 4346 }, { "completion_length": 151.3125, "epoch": 2.3258426966292136, "grad_norm": 1.0991663932800293, "kl": 0.16517581045627594, "learning_rate": 7.30582925259212e-07, "loss": 0.0066, "reward": 1.7174687385559082, "reward_std": 1.0786991119384766, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4362187385559082, "step": 4347 }, { "completion_length": 143.625, "epoch": 2.3263777421080793, "grad_norm": 1.5926131010055542, "kl": 0.16750316321849823, "learning_rate": 7.294837013124059e-07, "loss": 0.0067, "reward": 1.751406192779541, "reward_std": 0.8314507603645325, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4701562523841858, "step": 4348 }, { "completion_length": 107.8125, "epoch": 2.326912787586945, "grad_norm": 0.6185325384140015, "kl": 0.1794288009405136, "learning_rate": 7.283851636516464e-07, "loss": 0.0072, "reward": 2.0308125019073486, "reward_std": 0.23289678990840912, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49956250190734863, "step": 4349 }, { "completion_length": 146.96875, "epoch": 2.3274478330658104, "grad_norm": 0.7129053473472595, "kl": 0.18968424201011658, "learning_rate": 7.27287312702745e-07, "loss": 0.0076, "reward": 1.4644062519073486, "reward_std": 0.6665745377540588, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43315625190734863, "step": 4350 }, { "completion_length": 173.59375, "epoch": 2.327982878544676, "grad_norm": 0.7966206669807434, "kl": 0.14276273548603058, "learning_rate": 7.261901488912515e-07, "loss": 0.0057, "reward": 1.3861874341964722, "reward_std": 1.0555566549301147, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.37056249380111694, "step": 4351 }, { "completion_length": 109.0, "epoch": 2.328517924023542, "grad_norm": 1.2961881160736084, "kl": 0.2150927186012268, "learning_rate": 7.250936726424474e-07, "loss": 0.0086, "reward": 3.0409064292907715, "reward_std": 0.8372948169708252, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47840625047683716, "step": 4352 }, { "completion_length": 137.28125, "epoch": 2.3290529695024076, "grad_norm": 1.3521888256072998, "kl": 0.16629043221473694, "learning_rate": 7.239978843813466e-07, "loss": 0.0067, "reward": 2.0341875553131104, "reward_std": 0.9108121395111084, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4716874957084656, "step": 4353 }, { "completion_length": 156.59375, "epoch": 2.3295880149812733, "grad_norm": 0.8141500949859619, "kl": 0.10794143378734589, "learning_rate": 7.229027845326994e-07, "loss": 0.0043, "reward": 1.9149062633514404, "reward_std": 0.6440268158912659, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44615626335144043, "step": 4354 }, { "completion_length": 145.75, "epoch": 2.330123060460139, "grad_norm": 1.4466947317123413, "kl": 0.4395006000995636, "learning_rate": 7.218083735209869e-07, "loss": 0.0176, "reward": 2.012125015258789, "reward_std": 1.1997618675231934, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.43400001525878906, "step": 4355 }, { "completion_length": 128.4375, "epoch": 2.330658105939005, "grad_norm": 1454.3931884765625, "kl": 201.48500061035156, "learning_rate": 7.207146517704236e-07, "loss": 8.0594, "reward": 2.4670937061309814, "reward_std": 0.8140889406204224, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4827187657356262, "step": 4356 }, { "completion_length": 151.5625, "epoch": 2.3311931514178705, "grad_norm": 1.8701590299606323, "kl": 0.28282514214515686, "learning_rate": 7.19621619704956e-07, "loss": 0.0113, "reward": 1.4142500162124634, "reward_std": 0.7968218326568604, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.429874986410141, "step": 4357 }, { "completion_length": 129.8125, "epoch": 2.3317281968967363, "grad_norm": 0.7584244608879089, "kl": 0.19137614965438843, "learning_rate": 7.185292777482664e-07, "loss": 0.0077, "reward": 2.3653438091278076, "reward_std": 0.8316782712936401, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47471874952316284, "step": 4358 }, { "completion_length": 148.34375, "epoch": 2.332263242375602, "grad_norm": 3.9512124061584473, "kl": 0.32340604066848755, "learning_rate": 7.174376263237665e-07, "loss": 0.0129, "reward": 1.4352188110351562, "reward_std": 0.8216946125030518, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4352187514305115, "step": 4359 }, { "completion_length": 133.03125, "epoch": 2.3327982878544677, "grad_norm": 1.1539136171340942, "kl": 0.13492996990680695, "learning_rate": 7.163466658546e-07, "loss": 0.0054, "reward": 2.1787188053131104, "reward_std": 0.4939727783203125, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4287187457084656, "step": 4360 }, { "completion_length": 120.78125, "epoch": 2.3333333333333335, "grad_norm": 1.4339048862457275, "kl": 0.1665249764919281, "learning_rate": 7.152563967636464e-07, "loss": 0.0067, "reward": 1.395687460899353, "reward_std": 0.4538652300834656, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4581875205039978, "step": 4361 }, { "completion_length": 130.9375, "epoch": 2.333868378812199, "grad_norm": 139.96282958984375, "kl": 0.5222111940383911, "learning_rate": 7.141668194735135e-07, "loss": 0.0209, "reward": 2.3944687843322754, "reward_std": 0.8701120018959045, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.441343754529953, "step": 4362 }, { "completion_length": 140.8125, "epoch": 2.3344034242910645, "grad_norm": 0.7516850829124451, "kl": 0.1711818426847458, "learning_rate": 7.130779344065419e-07, "loss": 0.0068, "reward": 1.8719375133514404, "reward_std": 0.7634948492050171, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46568751335144043, "step": 4363 }, { "completion_length": 119.625, "epoch": 2.3349384697699307, "grad_norm": 0.6076242327690125, "kl": 0.2054060995578766, "learning_rate": 7.11989741984806e-07, "loss": 0.0082, "reward": 2.773843765258789, "reward_std": 0.8165605068206787, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4769687354564667, "step": 4364 }, { "completion_length": 122.25, "epoch": 2.335473515248796, "grad_norm": 0.8113231658935547, "kl": 0.17117877304553986, "learning_rate": 7.109022426301088e-07, "loss": 0.0068, "reward": 2.9402499198913574, "reward_std": 0.4252464175224304, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4871249794960022, "step": 4365 }, { "completion_length": 158.75, "epoch": 2.3360085607276617, "grad_norm": 0.43617895245552063, "kl": 0.11122678965330124, "learning_rate": 7.098154367639862e-07, "loss": 0.0044, "reward": 1.5935624837875366, "reward_std": 1.006730556488037, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3591874837875366, "step": 4366 }, { "completion_length": 117.9375, "epoch": 2.3365436062065275, "grad_norm": 0.9154930710792542, "kl": 0.15866754949092865, "learning_rate": 7.087293248077043e-07, "loss": 0.0063, "reward": 2.4589061737060547, "reward_std": 0.7597546577453613, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.47453126311302185, "step": 4367 }, { "completion_length": 111.59375, "epoch": 2.337078651685393, "grad_norm": 0.5919054746627808, "kl": 0.15763625502586365, "learning_rate": 7.076439071822622e-07, "loss": 0.0063, "reward": 2.715156316757202, "reward_std": 0.5276731848716736, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4807812571525574, "step": 4368 }, { "completion_length": 133.6875, "epoch": 2.337613697164259, "grad_norm": 0.6363193988800049, "kl": 0.19022637605667114, "learning_rate": 7.065591843083878e-07, "loss": 0.0076, "reward": 2.0356249809265137, "reward_std": 0.4242205321788788, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47312498092651367, "step": 4369 }, { "completion_length": 147.1875, "epoch": 2.3381487426431247, "grad_norm": 0.9098653197288513, "kl": 0.14417824149131775, "learning_rate": 7.054751566065399e-07, "loss": 0.0058, "reward": 1.28725004196167, "reward_std": 0.6491769552230835, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42787501215934753, "step": 4370 }, { "completion_length": 122.28125, "epoch": 2.3386837881219904, "grad_norm": 0.5380527377128601, "kl": 0.14799469709396362, "learning_rate": 7.043918244969106e-07, "loss": 0.0059, "reward": 2.02734375, "reward_std": 0.6158784031867981, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46484375, "step": 4371 }, { "completion_length": 122.09375, "epoch": 2.339218833600856, "grad_norm": 2.1219654083251953, "kl": 0.4580954909324646, "learning_rate": 7.033091883994172e-07, "loss": 0.0183, "reward": 1.9399375915527344, "reward_std": 0.7330459356307983, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4711874723434448, "step": 4372 }, { "completion_length": 129.34375, "epoch": 2.339753879079722, "grad_norm": 0.9106993079185486, "kl": 0.1777772307395935, "learning_rate": 7.022272487337123e-07, "loss": 0.0071, "reward": 2.5804686546325684, "reward_std": 0.6994777917861938, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4710937440395355, "step": 4373 }, { "completion_length": 142.46875, "epoch": 2.3402889245585876, "grad_norm": 0.8539438843727112, "kl": 0.18097805976867676, "learning_rate": 7.011460059191761e-07, "loss": 0.0072, "reward": 2.006218671798706, "reward_std": 0.81369549036026, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45934373140335083, "step": 4374 }, { "completion_length": 120.3125, "epoch": 2.3408239700374533, "grad_norm": 1.2748910188674927, "kl": 0.19492672383785248, "learning_rate": 7.000654603749177e-07, "loss": 0.0078, "reward": 1.4152812957763672, "reward_std": 0.5201476812362671, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4621562361717224, "step": 4375 }, { "completion_length": 160.3125, "epoch": 2.3413590155163186, "grad_norm": 0.8027102947235107, "kl": 0.1413160264492035, "learning_rate": 6.989856125197791e-07, "loss": 0.0057, "reward": 1.4253125190734863, "reward_std": 0.7131938338279724, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42531248927116394, "step": 4376 }, { "completion_length": 123.15625, "epoch": 2.341894060995185, "grad_norm": 0.951852023601532, "kl": 0.2103334367275238, "learning_rate": 6.979064627723293e-07, "loss": 0.0084, "reward": 2.117687702178955, "reward_std": 1.04258131980896, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49268749356269836, "step": 4377 }, { "completion_length": 136.21875, "epoch": 2.34242910647405, "grad_norm": 0.6116722226142883, "kl": 0.21717704832553864, "learning_rate": 6.968280115508666e-07, "loss": 0.0087, "reward": 2.092156171798706, "reward_std": 0.6575169563293457, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4515312612056732, "step": 4378 }, { "completion_length": 133.9375, "epoch": 2.342964151952916, "grad_norm": 0.8889856338500977, "kl": 0.14840584993362427, "learning_rate": 6.957502592734208e-07, "loss": 0.0059, "reward": 2.3515625, "reward_std": 1.0539138317108154, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4765625, "step": 4379 }, { "completion_length": 141.375, "epoch": 2.3434991974317816, "grad_norm": 0.7502126097679138, "kl": 0.13682639598846436, "learning_rate": 6.946732063577488e-07, "loss": 0.0055, "reward": 2.1566250324249268, "reward_std": 1.1242046356201172, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.43787500262260437, "step": 4380 }, { "completion_length": 125.28125, "epoch": 2.3440342429106473, "grad_norm": 0.7937538623809814, "kl": 0.20173734426498413, "learning_rate": 6.935968532213366e-07, "loss": 0.0081, "reward": 2.0314688682556152, "reward_std": 0.6753557920455933, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4220937192440033, "step": 4381 }, { "completion_length": 106.78125, "epoch": 2.344569288389513, "grad_norm": 0.8708968758583069, "kl": 0.18337658047676086, "learning_rate": 6.925212002813988e-07, "loss": 0.0073, "reward": 2.828125, "reward_std": 0.4400395154953003, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 4382 }, { "completion_length": 134.46875, "epoch": 2.345104333868379, "grad_norm": 0.9510000348091125, "kl": 0.1516772359609604, "learning_rate": 6.914462479548806e-07, "loss": 0.0061, "reward": 1.8907499313354492, "reward_std": 0.7365895509719849, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.453249990940094, "step": 4383 }, { "completion_length": 154.84375, "epoch": 2.3456393793472445, "grad_norm": 0.3907926678657532, "kl": 0.120796337723732, "learning_rate": 6.903719966584535e-07, "loss": 0.0048, "reward": 1.3871874809265137, "reward_std": 0.6711090803146362, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41843751072883606, "step": 4384 }, { "completion_length": 130.65625, "epoch": 2.3461744248261103, "grad_norm": 1.0029447078704834, "kl": 0.2200019210577011, "learning_rate": 6.892984468085164e-07, "loss": 0.0088, "reward": 1.7058124542236328, "reward_std": 0.5897746086120605, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4714374840259552, "step": 4385 }, { "completion_length": 151.21875, "epoch": 2.346709470304976, "grad_norm": 2.5618038177490234, "kl": 0.27607840299606323, "learning_rate": 6.882255988212003e-07, "loss": 0.011, "reward": 2.221750020980835, "reward_std": 1.0494983196258545, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4717499911785126, "step": 4386 }, { "completion_length": 124.53125, "epoch": 2.3472445157838417, "grad_norm": 6.04926872253418, "kl": 0.40513402223587036, "learning_rate": 6.871534531123605e-07, "loss": 0.0162, "reward": 2.6346561908721924, "reward_std": 0.5779414772987366, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47840625047683716, "step": 4387 }, { "completion_length": 127.625, "epoch": 2.3477795612627075, "grad_norm": 1.19019615650177, "kl": 0.21757331490516663, "learning_rate": 6.860820100975802e-07, "loss": 0.0087, "reward": 2.253593921661377, "reward_std": 0.7982726693153381, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.472343772649765, "step": 4388 }, { "completion_length": 160.25, "epoch": 2.348314606741573, "grad_norm": 9.747956275939941, "kl": 0.5063633918762207, "learning_rate": 6.850112701921735e-07, "loss": 0.0203, "reward": 1.4685626029968262, "reward_std": 0.7333015203475952, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4216874837875366, "step": 4389 }, { "completion_length": 142.65625, "epoch": 2.348849652220439, "grad_norm": 0.5864468216896057, "kl": 0.17250478267669678, "learning_rate": 6.839412338111773e-07, "loss": 0.0069, "reward": 2.149250030517578, "reward_std": 0.8491784334182739, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46175000071525574, "step": 4390 }, { "completion_length": 147.5625, "epoch": 2.3493846976993042, "grad_norm": 0.862949788570404, "kl": 0.1896008849143982, "learning_rate": 6.828719013693591e-07, "loss": 0.0076, "reward": 1.5139687061309814, "reward_std": 0.4504128694534302, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4827187657356262, "step": 4391 }, { "completion_length": 129.96875, "epoch": 2.34991974317817, "grad_norm": 1.818941354751587, "kl": 0.23024141788482666, "learning_rate": 6.818032732812144e-07, "loss": 0.0092, "reward": 2.4081249237060547, "reward_std": 0.6302258968353271, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47062498331069946, "step": 4392 }, { "completion_length": 142.59375, "epoch": 2.3504547886570357, "grad_norm": 0.8953051567077637, "kl": 0.16581830382347107, "learning_rate": 6.807353499609606e-07, "loss": 0.0066, "reward": 1.6789686679840088, "reward_std": 1.0784692764282227, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44459375739097595, "step": 4393 }, { "completion_length": 122.375, "epoch": 2.3509898341359015, "grad_norm": 9.01475715637207, "kl": 0.47137022018432617, "learning_rate": 6.796681318225481e-07, "loss": 0.0189, "reward": 2.0229687690734863, "reward_std": 0.7565284967422485, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47609376907348633, "step": 4394 }, { "completion_length": 126.34375, "epoch": 2.351524879614767, "grad_norm": 1.022412896156311, "kl": 0.2175682634115219, "learning_rate": 6.786016192796499e-07, "loss": 0.0087, "reward": 2.071812391281128, "reward_std": 0.953402042388916, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4468125104904175, "step": 4395 }, { "completion_length": 131.3125, "epoch": 2.352059925093633, "grad_norm": 0.7364386916160583, "kl": 0.17679256200790405, "learning_rate": 6.77535812745666e-07, "loss": 0.0071, "reward": 2.2317187786102295, "reward_std": 0.817136824131012, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4817187488079071, "step": 4396 }, { "completion_length": 117.875, "epoch": 2.3525949705724987, "grad_norm": 1.0920802354812622, "kl": 0.16372516751289368, "learning_rate": 6.764707126337252e-07, "loss": 0.0065, "reward": 2.8593125343322754, "reward_std": 0.39772236347198486, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 4397 }, { "completion_length": 108.875, "epoch": 2.3531300160513644, "grad_norm": 1.023676872253418, "kl": 0.22242647409439087, "learning_rate": 6.754063193566798e-07, "loss": 0.0089, "reward": 2.7331252098083496, "reward_std": 0.8145143985748291, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4675000011920929, "step": 4398 }, { "completion_length": 123.0, "epoch": 2.35366506153023, "grad_norm": 0.8499074578285217, "kl": 0.2806616723537445, "learning_rate": 6.743426333271089e-07, "loss": 0.0112, "reward": 2.565281391143799, "reward_std": 1.1154839992523193, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4559062421321869, "step": 4399 }, { "completion_length": 133.21875, "epoch": 2.354200107009096, "grad_norm": 1.2699544429779053, "kl": 0.23553678393363953, "learning_rate": 6.732796549573167e-07, "loss": 0.0094, "reward": 1.733875036239624, "reward_std": 0.48380282521247864, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48387500643730164, "step": 4400 }, { "completion_length": 136.53125, "epoch": 2.3547351524879616, "grad_norm": 1.3569772243499756, "kl": 0.26860684156417847, "learning_rate": 6.72217384659336e-07, "loss": 0.0107, "reward": 1.7475625276565552, "reward_std": 0.7574553489685059, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4506875276565552, "step": 4401 }, { "completion_length": 147.96875, "epoch": 2.3552701979668274, "grad_norm": 0.6142012476921082, "kl": 0.16253530979156494, "learning_rate": 6.711558228449222e-07, "loss": 0.0065, "reward": 2.058406352996826, "reward_std": 0.7617741823196411, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.449031263589859, "step": 4402 }, { "completion_length": 145.34375, "epoch": 2.355805243445693, "grad_norm": 1.028565526008606, "kl": 0.18408292531967163, "learning_rate": 6.70094969925556e-07, "loss": 0.0074, "reward": 2.3380937576293945, "reward_std": 0.7022068500518799, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41621875762939453, "step": 4403 }, { "completion_length": 140.78125, "epoch": 2.3563402889245584, "grad_norm": 82.38463592529297, "kl": 6.0440449714660645, "learning_rate": 6.690348263124458e-07, "loss": 0.2418, "reward": 1.9940624237060547, "reward_std": 0.6260532140731812, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46281251311302185, "step": 4404 }, { "completion_length": 136.78125, "epoch": 2.356875334403424, "grad_norm": 0.9742981791496277, "kl": 0.20801536738872528, "learning_rate": 6.679753924165234e-07, "loss": 0.0083, "reward": 1.3918125629425049, "reward_std": 0.5365449786186218, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4543125033378601, "step": 4405 }, { "completion_length": 131.03125, "epoch": 2.35741037988229, "grad_norm": 1.5532352924346924, "kl": 0.2863202691078186, "learning_rate": 6.669166686484441e-07, "loss": 0.0115, "reward": 1.8319063186645508, "reward_std": 0.3571707606315613, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 4406 }, { "completion_length": 111.5625, "epoch": 2.3579454253611556, "grad_norm": 0.9881210923194885, "kl": 0.2590818703174591, "learning_rate": 6.658586554185917e-07, "loss": 0.0104, "reward": 2.6919686794281006, "reward_std": 0.9671859741210938, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48884373903274536, "step": 4407 }, { "completion_length": 120.25, "epoch": 2.3584804708400213, "grad_norm": 1.4223594665527344, "kl": 0.34260499477386475, "learning_rate": 6.648013531370712e-07, "loss": 0.0137, "reward": 1.8413437604904175, "reward_std": 0.3817945718765259, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4663437604904175, "step": 4408 }, { "completion_length": 94.34375, "epoch": 2.359015516318887, "grad_norm": 1.1456570625305176, "kl": 0.25587183237075806, "learning_rate": 6.637447622137136e-07, "loss": 0.0102, "reward": 2.640625, "reward_std": 0.909710168838501, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 4409 }, { "completion_length": 144.0, "epoch": 2.359550561797753, "grad_norm": 1.0563489198684692, "kl": 0.15424349904060364, "learning_rate": 6.62688883058073e-07, "loss": 0.0062, "reward": 2.0219688415527344, "reward_std": 0.7685927748680115, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3969687521457672, "step": 4410 }, { "completion_length": 143.40625, "epoch": 2.3600856072766185, "grad_norm": 1.0632718801498413, "kl": 0.22528180480003357, "learning_rate": 6.616337160794295e-07, "loss": 0.009, "reward": 0.9772500395774841, "reward_std": 0.45732900500297546, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.38350000977516174, "step": 4411 }, { "completion_length": 116.09375, "epoch": 2.3606206527554843, "grad_norm": 0.7275471687316895, "kl": 0.1700557917356491, "learning_rate": 6.605792616867857e-07, "loss": 0.0068, "reward": 2.098781108856201, "reward_std": 0.42041489481925964, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4737812280654907, "step": 4412 }, { "completion_length": 123.84375, "epoch": 2.36115569823435, "grad_norm": 1.3826559782028198, "kl": 0.22015780210494995, "learning_rate": 6.595255202888673e-07, "loss": 0.0088, "reward": 2.648124933242798, "reward_std": 0.7152921557426453, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4606249928474426, "step": 4413 }, { "completion_length": 105.28125, "epoch": 2.3616907437132157, "grad_norm": 0.6450918912887573, "kl": 0.24068468809127808, "learning_rate": 6.584724922941271e-07, "loss": 0.0096, "reward": 2.890500068664551, "reward_std": 0.6581834554672241, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 4414 }, { "completion_length": 115.0, "epoch": 2.3622257891920815, "grad_norm": 0.8600559830665588, "kl": 0.21069292724132538, "learning_rate": 6.574201781107359e-07, "loss": 0.0084, "reward": 2.602874994277954, "reward_std": 0.8799595832824707, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4778749942779541, "step": 4415 }, { "completion_length": 120.8125, "epoch": 2.362760834670947, "grad_norm": 1.7312004566192627, "kl": 0.17259740829467773, "learning_rate": 6.563685781465923e-07, "loss": 0.0069, "reward": 1.84765625, "reward_std": 0.7265638709068298, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 4416 }, { "completion_length": 131.8125, "epoch": 2.3632958801498125, "grad_norm": 1.1793880462646484, "kl": 0.2353440523147583, "learning_rate": 6.55317692809318e-07, "loss": 0.0094, "reward": 2.0526561737060547, "reward_std": 0.9737073183059692, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44328126311302185, "step": 4417 }, { "completion_length": 129.03125, "epoch": 2.3638309256286782, "grad_norm": 0.9314664006233215, "kl": 0.17254789173603058, "learning_rate": 6.542675225062537e-07, "loss": 0.0069, "reward": 1.8251562118530273, "reward_std": 0.7122702598571777, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45015624165534973, "step": 4418 }, { "completion_length": 143.21875, "epoch": 2.364365971107544, "grad_norm": 1.274125576019287, "kl": 0.16937966644763947, "learning_rate": 6.532180676444663e-07, "loss": 0.0068, "reward": 1.58384370803833, "reward_std": 0.7824000716209412, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45884376764297485, "step": 4419 }, { "completion_length": 137.0, "epoch": 2.3649010165864097, "grad_norm": 0.7878479361534119, "kl": 0.14888066053390503, "learning_rate": 6.521693286307468e-07, "loss": 0.006, "reward": 2.2174999713897705, "reward_std": 0.7884659767150879, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4675000011920929, "step": 4420 }, { "completion_length": 125.875, "epoch": 2.3654360620652755, "grad_norm": 0.8316420316696167, "kl": 0.20857812464237213, "learning_rate": 6.511213058716031e-07, "loss": 0.0083, "reward": 2.4716875553131104, "reward_std": 0.7888354063034058, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4873124957084656, "step": 4421 }, { "completion_length": 123.34375, "epoch": 2.365971107544141, "grad_norm": 1.2532451152801514, "kl": 0.19671767950057983, "learning_rate": 6.500739997732717e-07, "loss": 0.0079, "reward": 1.9900624752044678, "reward_std": 0.6987351775169373, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49006250500679016, "step": 4422 }, { "completion_length": 132.75, "epoch": 2.366506153023007, "grad_norm": 19.6894588470459, "kl": 0.32205715775489807, "learning_rate": 6.490274107417072e-07, "loss": 0.0129, "reward": 2.49581241607666, "reward_std": 0.8092949390411377, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4489375054836273, "step": 4423 }, { "completion_length": 127.875, "epoch": 2.3670411985018727, "grad_norm": 2.3782992362976074, "kl": 0.21997402608394623, "learning_rate": 6.47981539182587e-07, "loss": 0.0088, "reward": 2.2417500019073486, "reward_std": 1.1716428995132446, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47612500190734863, "step": 4424 }, { "completion_length": 107.71875, "epoch": 2.3675762439807384, "grad_norm": 9.528300285339355, "kl": 2.000844717025757, "learning_rate": 6.469363855013125e-07, "loss": 0.08, "reward": 2.499875068664551, "reward_std": 0.44276291131973267, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 4425 }, { "completion_length": 129.0625, "epoch": 2.368111289459604, "grad_norm": 1.0124415159225464, "kl": 0.19741959869861603, "learning_rate": 6.458919501030044e-07, "loss": 0.0079, "reward": 2.32393741607666, "reward_std": 0.7410539388656616, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4645625054836273, "step": 4426 }, { "completion_length": 134.6875, "epoch": 2.36864633493847, "grad_norm": 0.6515032052993774, "kl": 0.2075447142124176, "learning_rate": 6.448482333925063e-07, "loss": 0.0083, "reward": 2.1323750019073486, "reward_std": 0.5577204823493958, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42925000190734863, "step": 4427 }, { "completion_length": 127.6875, "epoch": 2.3691813804173356, "grad_norm": 1.0112941265106201, "kl": 0.17181673645973206, "learning_rate": 6.438052357743816e-07, "loss": 0.0069, "reward": 2.2872188091278076, "reward_std": 0.8875473737716675, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47471874952316284, "step": 4428 }, { "completion_length": 152.40625, "epoch": 2.3697164258962014, "grad_norm": 0.8815793991088867, "kl": 0.16311579942703247, "learning_rate": 6.427629576529177e-07, "loss": 0.0065, "reward": 1.009812593460083, "reward_std": 0.37936413288116455, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41606253385543823, "step": 4429 }, { "completion_length": 154.59375, "epoch": 2.3702514713750666, "grad_norm": 1.4524861574172974, "kl": 0.2547781765460968, "learning_rate": 6.417213994321209e-07, "loss": 0.0102, "reward": 1.5859375, "reward_std": 0.6821790933609009, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 4430 }, { "completion_length": 138.46875, "epoch": 2.370786516853933, "grad_norm": 0.8737033605575562, "kl": 0.17358560860157013, "learning_rate": 6.406805615157185e-07, "loss": 0.0069, "reward": 2.1174373626708984, "reward_std": 1.1480872631072998, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4299375116825104, "step": 4431 }, { "completion_length": 126.1875, "epoch": 2.371321562332798, "grad_norm": 1.2125474214553833, "kl": 0.2585625648498535, "learning_rate": 6.396404443071608e-07, "loss": 0.0103, "reward": 1.3224375247955322, "reward_std": 0.2869442105293274, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47868749499320984, "step": 4432 }, { "completion_length": 118.1875, "epoch": 2.371856607811664, "grad_norm": 0.6332758665084839, "kl": 0.19770069420337677, "learning_rate": 6.386010482096158e-07, "loss": 0.0079, "reward": 2.5199999809265137, "reward_std": 0.7611976265907288, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45750001072883606, "step": 4433 }, { "completion_length": 133.875, "epoch": 2.3723916532905296, "grad_norm": 2.414437770843506, "kl": 0.19484584033489227, "learning_rate": 6.375623736259734e-07, "loss": 0.0078, "reward": 2.3203125, "reward_std": 1.0158307552337646, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 4434 }, { "completion_length": 134.15625, "epoch": 2.3729266987693953, "grad_norm": 678.5787353515625, "kl": 5.167669773101807, "learning_rate": 6.365244209588456e-07, "loss": 0.2067, "reward": 1.5619375705718994, "reward_std": 0.6241620779037476, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46818751096725464, "step": 4435 }, { "completion_length": 123.25, "epoch": 2.373461744248261, "grad_norm": 0.9633941054344177, "kl": 0.1625904142856598, "learning_rate": 6.354871906105598e-07, "loss": 0.0065, "reward": 2.3231873512268066, "reward_std": 0.9432311654090881, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4638125002384186, "step": 4436 }, { "completion_length": 137.84375, "epoch": 2.373996789727127, "grad_norm": 1.3126170635223389, "kl": 0.17469221353530884, "learning_rate": 6.344506829831685e-07, "loss": 0.007, "reward": 1.84765625, "reward_std": 0.7034391164779663, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.39453125, "step": 4437 }, { "completion_length": 131.15625, "epoch": 2.3745318352059925, "grad_norm": 1.0880831480026245, "kl": 0.19672641158103943, "learning_rate": 6.334148984784414e-07, "loss": 0.0079, "reward": 2.463156223297119, "reward_std": 0.8361638784408569, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47878122329711914, "step": 4438 }, { "completion_length": 113.4375, "epoch": 2.3750668806848583, "grad_norm": 1.0070135593414307, "kl": 0.22137749195098877, "learning_rate": 6.323798374978673e-07, "loss": 0.0089, "reward": 1.870437502861023, "reward_std": 0.6125388741493225, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47981250286102295, "step": 4439 }, { "completion_length": 112.5625, "epoch": 2.375601926163724, "grad_norm": 1.370100975036621, "kl": 0.271738201379776, "learning_rate": 6.313455004426577e-07, "loss": 0.0109, "reward": 2.7064688205718994, "reward_std": 0.6247749328613281, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48771876096725464, "step": 4440 }, { "completion_length": 153.0625, "epoch": 2.3761369716425897, "grad_norm": 1.1256705522537231, "kl": 0.20942692458629608, "learning_rate": 6.303118877137404e-07, "loss": 0.0084, "reward": 1.774593710899353, "reward_std": 1.0219653844833374, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.430843710899353, "step": 4441 }, { "completion_length": 126.625, "epoch": 2.3766720171214555, "grad_norm": 0.9740155935287476, "kl": 0.14298418164253235, "learning_rate": 6.292789997117635e-07, "loss": 0.0057, "reward": 2.772531270980835, "reward_std": 0.6170290112495422, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4912812411785126, "step": 4442 }, { "completion_length": 134.75, "epoch": 2.377207062600321, "grad_norm": 0.43065351247787476, "kl": 0.1500261127948761, "learning_rate": 6.282468368370934e-07, "loss": 0.006, "reward": 2.31640625, "reward_std": 0.3546988368034363, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 4443 }, { "completion_length": 118.34375, "epoch": 2.377742108079187, "grad_norm": 0.5996344089508057, "kl": 0.2262634038925171, "learning_rate": 6.272153994898178e-07, "loss": 0.0091, "reward": 1.8077499866485596, "reward_std": 0.5230122208595276, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47962498664855957, "step": 4444 }, { "completion_length": 133.5625, "epoch": 2.3782771535580522, "grad_norm": 1.3674213886260986, "kl": 0.19011661410331726, "learning_rate": 6.261846880697405e-07, "loss": 0.0076, "reward": 1.804593801498413, "reward_std": 0.465416818857193, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4764687418937683, "step": 4445 }, { "completion_length": 121.0625, "epoch": 2.378812199036918, "grad_norm": 0.6966620087623596, "kl": 0.18085822463035583, "learning_rate": 6.251547029763852e-07, "loss": 0.0072, "reward": 2.3039374351501465, "reward_std": 0.8459634780883789, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49143749475479126, "step": 4446 }, { "completion_length": 123.40625, "epoch": 2.3793472445157837, "grad_norm": 1.0938866138458252, "kl": 0.24612212181091309, "learning_rate": 6.241254446089942e-07, "loss": 0.0098, "reward": 2.721437454223633, "reward_std": 0.9956341981887817, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4870624840259552, "step": 4447 }, { "completion_length": 139.75, "epoch": 2.3798822899946495, "grad_norm": 7.914009094238281, "kl": 0.6215535402297974, "learning_rate": 6.230969133665277e-07, "loss": 0.0249, "reward": 1.1100938320159912, "reward_std": 0.5804443955421448, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42259374260902405, "step": 4448 }, { "completion_length": 139.84375, "epoch": 2.380417335473515, "grad_norm": 0.7784817814826965, "kl": 0.1738012433052063, "learning_rate": 6.220691096476633e-07, "loss": 0.007, "reward": 2.2296249866485596, "reward_std": 0.8645710945129395, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41712498664855957, "step": 4449 }, { "completion_length": 121.40625, "epoch": 2.380952380952381, "grad_norm": 3.067903518676758, "kl": 0.1961289942264557, "learning_rate": 6.210420338507986e-07, "loss": 0.0078, "reward": 2.3594374656677246, "reward_std": 0.7757546901702881, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.484437495470047, "step": 4450 }, { "completion_length": 117.0, "epoch": 2.3814874264312467, "grad_norm": 0.9074132442474365, "kl": 0.19226965308189392, "learning_rate": 6.200156863740476e-07, "loss": 0.0077, "reward": 2.152437448501587, "reward_std": 0.8470349907875061, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4805625081062317, "step": 4451 }, { "completion_length": 109.15625, "epoch": 2.3820224719101124, "grad_norm": 2.2159383296966553, "kl": 0.502164900302887, "learning_rate": 6.189900676152416e-07, "loss": 0.0201, "reward": 1.9869375228881836, "reward_std": 0.7138282656669617, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4869375228881836, "step": 4452 }, { "completion_length": 158.3125, "epoch": 2.382557517388978, "grad_norm": 0.9229987859725952, "kl": 0.17584921419620514, "learning_rate": 6.1796517797193e-07, "loss": 0.007, "reward": 1.9261562824249268, "reward_std": 1.0848331451416016, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.394906222820282, "step": 4453 }, { "completion_length": 124.34375, "epoch": 2.383092562867844, "grad_norm": 1.2641316652297974, "kl": 0.16251707077026367, "learning_rate": 6.169410178413809e-07, "loss": 0.0065, "reward": 2.6625313758850098, "reward_std": 1.0024566650390625, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4906562566757202, "step": 4454 }, { "completion_length": 142.59375, "epoch": 2.3836276083467096, "grad_norm": 0.5389150977134705, "kl": 0.14272059500217438, "learning_rate": 6.159175876205775e-07, "loss": 0.0057, "reward": 1.401187539100647, "reward_std": 0.5059351921081543, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4324375092983246, "step": 4455 }, { "completion_length": 133.375, "epoch": 2.3841626538255754, "grad_norm": 1.0086376667022705, "kl": 0.21867743134498596, "learning_rate": 6.148948877062202e-07, "loss": 0.0087, "reward": 1.9989062547683716, "reward_std": 0.5804456472396851, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4364062547683716, "step": 4456 }, { "completion_length": 147.125, "epoch": 2.384697699304441, "grad_norm": 1.136457085609436, "kl": 0.20053459703922272, "learning_rate": 6.138729184947289e-07, "loss": 0.008, "reward": 1.3887187242507935, "reward_std": 0.8337932825088501, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.40434372425079346, "step": 4457 }, { "completion_length": 130.71875, "epoch": 2.3852327447833064, "grad_norm": 2.322277069091797, "kl": 0.18626776337623596, "learning_rate": 6.128516803822376e-07, "loss": 0.0075, "reward": 2.3668124675750732, "reward_std": 1.0315059423446655, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47618749737739563, "step": 4458 }, { "completion_length": 112.96875, "epoch": 2.385767790262172, "grad_norm": 0.7268316745758057, "kl": 0.15918058156967163, "learning_rate": 6.118311737645969e-07, "loss": 0.0064, "reward": 2.2968125343322754, "reward_std": 0.30314549803733826, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 4459 }, { "completion_length": 137.34375, "epoch": 2.386302835741038, "grad_norm": 1.6071585416793823, "kl": 0.19550853967666626, "learning_rate": 6.108113990373768e-07, "loss": 0.0078, "reward": 1.6430312395095825, "reward_std": 0.6509420871734619, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4867812395095825, "step": 4460 }, { "completion_length": 113.34375, "epoch": 2.3868378812199036, "grad_norm": 2.456418037414551, "kl": 0.4899313151836395, "learning_rate": 6.097923565958589e-07, "loss": 0.0196, "reward": 3.1624064445495605, "reward_std": 0.44579172134399414, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4749062657356262, "step": 4461 }, { "completion_length": 152.09375, "epoch": 2.3873729266987693, "grad_norm": 1.014927625656128, "kl": 0.18460848927497864, "learning_rate": 6.087740468350448e-07, "loss": 0.0074, "reward": 1.2252187728881836, "reward_std": 0.4861180782318115, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4127187728881836, "step": 4462 }, { "completion_length": 140.15625, "epoch": 2.387907972177635, "grad_norm": 0.5810077786445618, "kl": 0.18407857418060303, "learning_rate": 6.077564701496524e-07, "loss": 0.0074, "reward": 1.3706250190734863, "reward_std": 0.6201539635658264, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43312498927116394, "step": 4463 }, { "completion_length": 137.53125, "epoch": 2.388443017656501, "grad_norm": 0.8121805787086487, "kl": 0.19114258885383606, "learning_rate": 6.067396269341111e-07, "loss": 0.0076, "reward": 1.986687421798706, "reward_std": 0.7841036319732666, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47106248140335083, "step": 4464 }, { "completion_length": 143.03125, "epoch": 2.3889780631353665, "grad_norm": 1.5312951803207397, "kl": 0.17432621121406555, "learning_rate": 6.05723517582571e-07, "loss": 0.007, "reward": 1.8026561737060547, "reward_std": 0.8938701748847961, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42765623331069946, "step": 4465 }, { "completion_length": 118.9375, "epoch": 2.3895131086142323, "grad_norm": 1.1208823919296265, "kl": 0.1968020498752594, "learning_rate": 6.047081424888948e-07, "loss": 0.0079, "reward": 2.004906177520752, "reward_std": 0.587454617023468, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48928123712539673, "step": 4466 }, { "completion_length": 146.46875, "epoch": 2.390048154093098, "grad_norm": 0.7878599762916565, "kl": 0.1345280110836029, "learning_rate": 6.036935020466606e-07, "loss": 0.0054, "reward": 1.8692500591278076, "reward_std": 0.9776929616928101, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.44737499952316284, "step": 4467 }, { "completion_length": 123.78125, "epoch": 2.3905831995719637, "grad_norm": 0.7564030289649963, "kl": 0.21646419167518616, "learning_rate": 6.026795966491636e-07, "loss": 0.0087, "reward": 1.9763751029968262, "reward_std": 0.7529951930046082, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4451249837875366, "step": 4468 }, { "completion_length": 160.03125, "epoch": 2.3911182450508295, "grad_norm": 1.7439773082733154, "kl": 0.1701127290725708, "learning_rate": 6.016664266894129e-07, "loss": 0.0068, "reward": 1.5848437547683716, "reward_std": 1.0026923418045044, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4129687547683716, "step": 4469 }, { "completion_length": 131.28125, "epoch": 2.391653290529695, "grad_norm": 0.9374898076057434, "kl": 0.18893346190452576, "learning_rate": 6.006539925601321e-07, "loss": 0.0076, "reward": 1.9034061431884766, "reward_std": 0.45534849166870117, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4815312623977661, "step": 4470 }, { "completion_length": 116.53125, "epoch": 2.3921883360085605, "grad_norm": 1.3876569271087646, "kl": 0.31484052538871765, "learning_rate": 5.996422946537595e-07, "loss": 0.0126, "reward": 1.8516249656677246, "reward_std": 1.0888572931289673, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.476624995470047, "step": 4471 }, { "completion_length": 125.78125, "epoch": 2.3927233814874262, "grad_norm": 1.4897561073303223, "kl": 0.291159987449646, "learning_rate": 5.986313333624502e-07, "loss": 0.0116, "reward": 2.6232500076293945, "reward_std": 0.7088459730148315, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49825000762939453, "step": 4472 }, { "completion_length": 135.71875, "epoch": 2.393258426966292, "grad_norm": 1.59063720703125, "kl": 0.21831777691841125, "learning_rate": 5.976211090780712e-07, "loss": 0.0087, "reward": 1.794374942779541, "reward_std": 0.7535276412963867, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4662500023841858, "step": 4473 }, { "completion_length": 103.53125, "epoch": 2.3937934724451577, "grad_norm": 0.7101821303367615, "kl": 0.22079309821128845, "learning_rate": 5.966116221922041e-07, "loss": 0.0088, "reward": 2.5859375, "reward_std": 0.7678993940353394, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4765625, "step": 4474 }, { "completion_length": 151.71875, "epoch": 2.3943285179240235, "grad_norm": 7.53611421585083, "kl": 0.9087886810302734, "learning_rate": 5.956028730961469e-07, "loss": 0.0364, "reward": 1.6036875247955322, "reward_std": 0.5262112617492676, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.38493749499320984, "step": 4475 }, { "completion_length": 152.96875, "epoch": 2.394863563402889, "grad_norm": 0.6413254737854004, "kl": 0.13487252593040466, "learning_rate": 5.945948621809092e-07, "loss": 0.0054, "reward": 1.7163749933242798, "reward_std": 0.5727565288543701, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.3882499933242798, "step": 4476 }, { "completion_length": 103.0625, "epoch": 2.395398608881755, "grad_norm": 1.2363156080245972, "kl": 0.21018198132514954, "learning_rate": 5.935875898372148e-07, "loss": 0.0084, "reward": 2.518812656402588, "reward_std": 0.7273101210594177, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47193750739097595, "step": 4477 }, { "completion_length": 123.375, "epoch": 2.3959336543606207, "grad_norm": 1.0724730491638184, "kl": 0.4361671507358551, "learning_rate": 5.925810564555031e-07, "loss": 0.0174, "reward": 1.9136250019073486, "reward_std": 0.5483254194259644, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47612500190734863, "step": 4478 }, { "completion_length": 140.09375, "epoch": 2.3964686998394864, "grad_norm": 0.6331452131271362, "kl": 0.1455874890089035, "learning_rate": 5.915752624259252e-07, "loss": 0.0058, "reward": 2.8057186603546143, "reward_std": 0.3621791899204254, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4775937497615814, "step": 4479 }, { "completion_length": 117.125, "epoch": 2.397003745318352, "grad_norm": 1.486696720123291, "kl": 0.16801297664642334, "learning_rate": 5.905702081383457e-07, "loss": 0.0067, "reward": 2.6677498817443848, "reward_std": 0.626493513584137, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4802500009536743, "step": 4480 }, { "completion_length": 112.9375, "epoch": 2.397538790797218, "grad_norm": 0.9467697143554688, "kl": 0.2384290099143982, "learning_rate": 5.895658939823428e-07, "loss": 0.0095, "reward": 2.1461563110351562, "reward_std": 0.7595570087432861, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4899062514305115, "step": 4481 }, { "completion_length": 112.96875, "epoch": 2.3980738362760836, "grad_norm": 0.4618895351886749, "kl": 0.16712921857833862, "learning_rate": 5.88562320347209e-07, "loss": 0.0067, "reward": 2.03515625, "reward_std": 0.3484736979007721, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 4482 }, { "completion_length": 137.0625, "epoch": 2.3986088817549494, "grad_norm": 0.6285695433616638, "kl": 0.17434711754322052, "learning_rate": 5.875594876219479e-07, "loss": 0.007, "reward": 2.0115625858306885, "reward_std": 1.0061604976654053, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4646875262260437, "step": 4483 }, { "completion_length": 125.5, "epoch": 2.3991439272338146, "grad_norm": 0.9204445481300354, "kl": 0.23585835099220276, "learning_rate": 5.865573961952759e-07, "loss": 0.0094, "reward": 1.7960000038146973, "reward_std": 0.7354698777198792, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43662500381469727, "step": 4484 }, { "completion_length": 139.96875, "epoch": 2.399678972712681, "grad_norm": 1.6845979690551758, "kl": 0.23899561166763306, "learning_rate": 5.855560464556253e-07, "loss": 0.0096, "reward": 1.9807813167572021, "reward_std": 0.5479981899261475, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4807812571525574, "step": 4485 }, { "completion_length": 126.6875, "epoch": 2.400214018191546, "grad_norm": 2.0000956058502197, "kl": 0.24353112280368805, "learning_rate": 5.845554387911353e-07, "loss": 0.0097, "reward": 1.99609375, "reward_std": 0.5559245347976685, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.49609375, "step": 4486 }, { "completion_length": 136.96875, "epoch": 2.400749063670412, "grad_norm": 1.549140453338623, "kl": 0.23803606629371643, "learning_rate": 5.835555735896622e-07, "loss": 0.0095, "reward": 1.625937581062317, "reward_std": 0.7579798698425293, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4696875214576721, "step": 4487 }, { "completion_length": 126.25, "epoch": 2.4012841091492776, "grad_norm": 1.6566022634506226, "kl": 0.22285400331020355, "learning_rate": 5.825564512387744e-07, "loss": 0.0089, "reward": 2.0444374084472656, "reward_std": 0.7278138399124146, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4350625276565552, "step": 4488 }, { "completion_length": 136.53125, "epoch": 2.4018191546281433, "grad_norm": 0.9562280178070068, "kl": 0.23422721028327942, "learning_rate": 5.815580721257477e-07, "loss": 0.0094, "reward": 1.8693125247955322, "reward_std": 0.7835609912872314, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46306249499320984, "step": 4489 }, { "completion_length": 121.53125, "epoch": 2.402354200107009, "grad_norm": 0.6965190172195435, "kl": 0.18241262435913086, "learning_rate": 5.805604366375753e-07, "loss": 0.0073, "reward": 2.16825008392334, "reward_std": 0.6897464990615845, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46512502431869507, "step": 4490 }, { "completion_length": 120.625, "epoch": 2.402889245585875, "grad_norm": 1.4912620782852173, "kl": 0.1927003562450409, "learning_rate": 5.795635451609594e-07, "loss": 0.0077, "reward": 2.212249994277954, "reward_std": 0.5721009969711304, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4622499942779541, "step": 4491 }, { "completion_length": 127.71875, "epoch": 2.4034242910647405, "grad_norm": 0.6277855634689331, "kl": 0.2007540464401245, "learning_rate": 5.785673980823131e-07, "loss": 0.008, "reward": 1.9050312042236328, "reward_std": 0.6303253769874573, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4675312638282776, "step": 4492 }, { "completion_length": 106.78125, "epoch": 2.4039593365436063, "grad_norm": 0.5678234696388245, "kl": 0.20270872116088867, "learning_rate": 5.775719957877638e-07, "loss": 0.0081, "reward": 2.75, "reward_std": 0.7848104238510132, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 4493 }, { "completion_length": 156.40625, "epoch": 2.404494382022472, "grad_norm": 0.7915107607841492, "kl": 0.13417747616767883, "learning_rate": 5.765773386631474e-07, "loss": 0.0054, "reward": 1.859375, "reward_std": 0.7186411023139954, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4375, "step": 4494 }, { "completion_length": 139.4375, "epoch": 2.4050294275013377, "grad_norm": 1.4065773487091064, "kl": 0.1692855805158615, "learning_rate": 5.755834270940119e-07, "loss": 0.0068, "reward": 2.2383437156677246, "reward_std": 1.1778576374053955, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.457093745470047, "step": 4495 }, { "completion_length": 123.28125, "epoch": 2.4055644729802035, "grad_norm": 1.51456618309021, "kl": 0.1811186671257019, "learning_rate": 5.745902614656174e-07, "loss": 0.0072, "reward": 2.2368435859680176, "reward_std": 0.8959764242172241, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4868437647819519, "step": 4496 }, { "completion_length": 127.9375, "epoch": 2.4060995184590688, "grad_norm": 0.5771340727806091, "kl": 0.1897900551557541, "learning_rate": 5.735978421629338e-07, "loss": 0.0076, "reward": 2.2109375, "reward_std": 0.803614616394043, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4765625, "step": 4497 }, { "completion_length": 130.8125, "epoch": 2.406634563937935, "grad_norm": 0.6777713298797607, "kl": 0.15886196494102478, "learning_rate": 5.726061695706411e-07, "loss": 0.0064, "reward": 2.2820000648498535, "reward_std": 0.6693637371063232, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.45387500524520874, "step": 4498 }, { "completion_length": 140.46875, "epoch": 2.4071696094168002, "grad_norm": 1.4747642278671265, "kl": 0.1852973997592926, "learning_rate": 5.716152440731304e-07, "loss": 0.0074, "reward": 2.1818125247955322, "reward_std": 0.934522271156311, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43181249499320984, "step": 4499 }, { "completion_length": 140.25, "epoch": 2.407704654895666, "grad_norm": 0.4950331151485443, "kl": 0.22902555763721466, "learning_rate": 5.706250660545045e-07, "loss": 0.0092, "reward": 1.5835624933242798, "reward_std": 0.7903404235839844, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4741874933242798, "step": 4500 }, { "completion_length": 125.53125, "epoch": 2.4082397003745317, "grad_norm": 3.647043228149414, "kl": 0.28520020842552185, "learning_rate": 5.696356358985752e-07, "loss": 0.0114, "reward": 2.5645625591278076, "reward_std": 0.8087431192398071, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47081252932548523, "step": 4501 }, { "completion_length": 143.3125, "epoch": 2.4087747458533975, "grad_norm": 0.9582520127296448, "kl": 0.1943955421447754, "learning_rate": 5.686469539888633e-07, "loss": 0.0078, "reward": 1.9181874990463257, "reward_std": 0.8230019807815552, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4650624990463257, "step": 4502 }, { "completion_length": 130.03125, "epoch": 2.409309791332263, "grad_norm": 1.7630980014801025, "kl": 0.2867914140224457, "learning_rate": 5.676590207086036e-07, "loss": 0.0115, "reward": 1.7157812118530273, "reward_std": 1.0865938663482666, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45015624165534973, "step": 4503 }, { "completion_length": 125.125, "epoch": 2.409844836811129, "grad_norm": 0.6246693730354309, "kl": 0.18074458837509155, "learning_rate": 5.666718364407342e-07, "loss": 0.0072, "reward": 1.9585312604904175, "reward_std": 0.5289390087127686, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4585312306880951, "step": 4504 }, { "completion_length": 144.28125, "epoch": 2.4103798822899947, "grad_norm": 0.7483855485916138, "kl": 0.1567685604095459, "learning_rate": 5.656854015679091e-07, "loss": 0.0063, "reward": 1.3877500295639038, "reward_std": 0.4237895905971527, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4658749997615814, "step": 4505 }, { "completion_length": 112.96875, "epoch": 2.4109149277688604, "grad_norm": 1.478447437286377, "kl": 0.1824210286140442, "learning_rate": 5.646997164724901e-07, "loss": 0.0073, "reward": 2.4834063053131104, "reward_std": 0.8781115412712097, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4834062457084656, "step": 4506 }, { "completion_length": 156.375, "epoch": 2.411449973247726, "grad_norm": 5.446552276611328, "kl": 0.23318269848823547, "learning_rate": 5.637147815365448e-07, "loss": 0.0093, "reward": 1.027500033378601, "reward_std": 0.40901151299476624, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.3243750035762787, "step": 4507 }, { "completion_length": 121.03125, "epoch": 2.411985018726592, "grad_norm": 0.6759741902351379, "kl": 0.17400223016738892, "learning_rate": 5.627305971418556e-07, "loss": 0.007, "reward": 2.953125, "reward_std": 0.4182470440864563, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 4508 }, { "completion_length": 135.96875, "epoch": 2.4125200642054576, "grad_norm": 1.2116531133651733, "kl": 0.19854304194450378, "learning_rate": 5.617471636699098e-07, "loss": 0.0079, "reward": 2.1268436908721924, "reward_std": 0.7412816882133484, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45496875047683716, "step": 4509 }, { "completion_length": 108.625, "epoch": 2.4130551096843234, "grad_norm": 0.8686718940734863, "kl": 0.3094756603240967, "learning_rate": 5.607644815019047e-07, "loss": 0.0124, "reward": 2.735875129699707, "reward_std": 0.7197158932685852, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4702499806880951, "step": 4510 }, { "completion_length": 114.71875, "epoch": 2.413590155163189, "grad_norm": 0.9780305027961731, "kl": 0.1880468875169754, "learning_rate": 5.59782551018748e-07, "loss": 0.0075, "reward": 2.55078125, "reward_std": 0.7799391746520996, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 4511 }, { "completion_length": 118.125, "epoch": 2.4141252006420544, "grad_norm": 1.026250958442688, "kl": 0.17571710050106049, "learning_rate": 5.588013726010544e-07, "loss": 0.007, "reward": 2.352781295776367, "reward_std": 0.7993285655975342, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4465312361717224, "step": 4512 }, { "completion_length": 135.71875, "epoch": 2.41466024612092, "grad_norm": 0.8345706462860107, "kl": 0.17504680156707764, "learning_rate": 5.57820946629147e-07, "loss": 0.007, "reward": 2.5630624294281006, "reward_std": 1.1230448484420776, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46931248903274536, "step": 4513 }, { "completion_length": 152.75, "epoch": 2.415195291599786, "grad_norm": 1.0358601808547974, "kl": 0.15220746397972107, "learning_rate": 5.568412734830572e-07, "loss": 0.0061, "reward": 1.5775938034057617, "reward_std": 0.8399003744125366, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43696874380111694, "step": 4514 }, { "completion_length": 165.9375, "epoch": 2.4157303370786516, "grad_norm": 3.4160521030426025, "kl": 0.2414984405040741, "learning_rate": 5.558623535425267e-07, "loss": 0.0097, "reward": 1.2852187156677246, "reward_std": 0.9666936993598938, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.378968745470047, "step": 4515 }, { "completion_length": 114.71875, "epoch": 2.4162653825575173, "grad_norm": 1.2512677907943726, "kl": 0.33634987473487854, "learning_rate": 5.548841871870025e-07, "loss": 0.0135, "reward": 2.546875, "reward_std": 0.7658787965774536, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 4516 }, { "completion_length": 134.59375, "epoch": 2.416800428036383, "grad_norm": 0.7932771444320679, "kl": 0.15539482235908508, "learning_rate": 5.539067747956403e-07, "loss": 0.0062, "reward": 2.1464061737060547, "reward_std": 0.9131191968917847, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41203123331069946, "step": 4517 }, { "completion_length": 119.65625, "epoch": 2.417335473515249, "grad_norm": 1.9629533290863037, "kl": 0.2970767319202423, "learning_rate": 5.52930116747305e-07, "loss": 0.0119, "reward": 2.355062484741211, "reward_std": 0.9969806671142578, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4800625145435333, "step": 4518 }, { "completion_length": 138.65625, "epoch": 2.4178705189941145, "grad_norm": 1.1469718217849731, "kl": 0.29405683279037476, "learning_rate": 5.519542134205674e-07, "loss": 0.0118, "reward": 1.7905311584472656, "reward_std": 0.6515506505966187, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4311562478542328, "step": 4519 }, { "completion_length": 139.5, "epoch": 2.4184055644729803, "grad_norm": 0.7160325646400452, "kl": 0.15490859746932983, "learning_rate": 5.509790651937058e-07, "loss": 0.0062, "reward": 1.9609375, "reward_std": 0.5808281898498535, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4765625, "step": 4520 }, { "completion_length": 123.875, "epoch": 2.418940609951846, "grad_norm": 0.8767622709274292, "kl": 0.18059608340263367, "learning_rate": 5.500046724447075e-07, "loss": 0.0072, "reward": 2.0600624084472656, "reward_std": 0.7863286733627319, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4819374978542328, "step": 4521 }, { "completion_length": 99.84375, "epoch": 2.4194756554307117, "grad_norm": 1.0639389753341675, "kl": 0.24046199023723602, "learning_rate": 5.490310355512651e-07, "loss": 0.0096, "reward": 3.0674688816070557, "reward_std": 0.8539048433303833, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4893437623977661, "step": 4522 }, { "completion_length": 134.09375, "epoch": 2.4200107009095775, "grad_norm": 0.7537253499031067, "kl": 0.19996492564678192, "learning_rate": 5.48058154890779e-07, "loss": 0.008, "reward": 1.825374960899353, "reward_std": 0.3661111891269684, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4503750205039978, "step": 4523 }, { "completion_length": 126.40625, "epoch": 2.420545746388443, "grad_norm": 0.707417368888855, "kl": 0.15695101022720337, "learning_rate": 5.470860308403558e-07, "loss": 0.0063, "reward": 2.605343818664551, "reward_std": 0.5757735371589661, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.495968759059906, "step": 4524 }, { "completion_length": 152.03125, "epoch": 2.4210807918673085, "grad_norm": 0.8476465344429016, "kl": 0.18671084940433502, "learning_rate": 5.461146637768105e-07, "loss": 0.0075, "reward": 1.888124942779541, "reward_std": 0.8419650793075562, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4818750023841858, "step": 4525 }, { "completion_length": 114.96875, "epoch": 2.4216158373461742, "grad_norm": 1.4697508811950684, "kl": 0.31185442209243774, "learning_rate": 5.451440540766631e-07, "loss": 0.0125, "reward": 2.652437448501587, "reward_std": 1.003706693649292, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4493125081062317, "step": 4526 }, { "completion_length": 134.8125, "epoch": 2.42215088282504, "grad_norm": 0.47568562626838684, "kl": 0.13070163130760193, "learning_rate": 5.441742021161398e-07, "loss": 0.0052, "reward": 2.357156276702881, "reward_std": 0.22009660303592682, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46653127670288086, "step": 4527 }, { "completion_length": 112.78125, "epoch": 2.4226859283039057, "grad_norm": 0.7551901936531067, "kl": 0.22200621664524078, "learning_rate": 5.432051082711753e-07, "loss": 0.0089, "reward": 2.711750030517578, "reward_std": 0.6591353416442871, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4773750305175781, "step": 4528 }, { "completion_length": 138.75, "epoch": 2.4232209737827715, "grad_norm": 1.643398642539978, "kl": 0.3057250380516052, "learning_rate": 5.42236772917408e-07, "loss": 0.0122, "reward": 1.3895937204360962, "reward_std": 0.7173582315444946, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4208437502384186, "step": 4529 }, { "completion_length": 125.09375, "epoch": 2.423756019261637, "grad_norm": 2.8575358390808105, "kl": 0.19492414593696594, "learning_rate": 5.412691964301828e-07, "loss": 0.0078, "reward": 1.75390625, "reward_std": 0.6774451732635498, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 4530 }, { "completion_length": 127.9375, "epoch": 2.424291064740503, "grad_norm": 2.0248894691467285, "kl": 0.21861785650253296, "learning_rate": 5.403023791845527e-07, "loss": 0.0087, "reward": 2.5459375381469727, "reward_std": 0.8814446926116943, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46781250834465027, "step": 4531 }, { "completion_length": 172.65625, "epoch": 2.4248261102193687, "grad_norm": 2.9361987113952637, "kl": 0.15716859698295593, "learning_rate": 5.393363215552721e-07, "loss": 0.0063, "reward": 1.2638437747955322, "reward_std": 0.8372072577476501, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.38884374499320984, "step": 4532 }, { "completion_length": 152.15625, "epoch": 2.4253611556982344, "grad_norm": 0.8548313975334167, "kl": 0.20344148576259613, "learning_rate": 5.383710239168047e-07, "loss": 0.0081, "reward": 1.687343716621399, "reward_std": 0.47449833154678345, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4685937464237213, "step": 4533 }, { "completion_length": 124.375, "epoch": 2.4258962011771, "grad_norm": 1.3889820575714111, "kl": 0.20939025282859802, "learning_rate": 5.374064866433196e-07, "loss": 0.0084, "reward": 2.16825008392334, "reward_std": 0.4626789689064026, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4963749945163727, "step": 4534 }, { "completion_length": 138.375, "epoch": 2.426431246655966, "grad_norm": 0.525854229927063, "kl": 0.15111865103244781, "learning_rate": 5.364427101086874e-07, "loss": 0.006, "reward": 1.8883438110351562, "reward_std": 0.6792985796928406, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4664687514305115, "step": 4535 }, { "completion_length": 124.9375, "epoch": 2.4269662921348316, "grad_norm": 1.2097241878509521, "kl": 0.14917609095573425, "learning_rate": 5.354796946864885e-07, "loss": 0.006, "reward": 2.1275625228881836, "reward_std": 0.8602977991104126, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4556874930858612, "step": 4536 }, { "completion_length": 137.4375, "epoch": 2.4275013376136974, "grad_norm": 0.6986331343650818, "kl": 0.19720469415187836, "learning_rate": 5.345174407500051e-07, "loss": 0.0079, "reward": 2.242374897003174, "reward_std": 0.6512336134910583, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4455000162124634, "step": 4537 }, { "completion_length": 110.4375, "epoch": 2.4280363830925626, "grad_norm": 2.8118367195129395, "kl": 0.46950721740722656, "learning_rate": 5.33555948672225e-07, "loss": 0.0188, "reward": 2.6464686393737793, "reward_std": 0.9279440641403198, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49021875858306885, "step": 4538 }, { "completion_length": 128.125, "epoch": 2.4285714285714284, "grad_norm": 1.4302947521209717, "kl": 0.18349651992321014, "learning_rate": 5.325952188258418e-07, "loss": 0.0073, "reward": 2.38853120803833, "reward_std": 0.4710986018180847, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48228126764297485, "step": 4539 }, { "completion_length": 112.75, "epoch": 2.429106474050294, "grad_norm": 30.944211959838867, "kl": 0.384215384721756, "learning_rate": 5.316352515832526e-07, "loss": 0.0154, "reward": 2.4375, "reward_std": 0.6123279929161072, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 4540 }, { "completion_length": 146.0625, "epoch": 2.42964151952916, "grad_norm": 0.9839821457862854, "kl": 0.13019107282161713, "learning_rate": 5.306760473165587e-07, "loss": 0.0052, "reward": 2.230968713760376, "reward_std": 0.9186921119689941, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41846874356269836, "step": 4541 }, { "completion_length": 134.28125, "epoch": 2.4301765650080256, "grad_norm": 1.816550612449646, "kl": 0.2025110423564911, "learning_rate": 5.297176063975656e-07, "loss": 0.0081, "reward": 1.8359375, "reward_std": 0.6951798796653748, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4921875, "step": 4542 }, { "completion_length": 133.75, "epoch": 2.4307116104868913, "grad_norm": 0.825288712978363, "kl": 0.13942907750606537, "learning_rate": 5.287599291977849e-07, "loss": 0.0056, "reward": 2.44140625, "reward_std": 0.7715511322021484, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 4543 }, { "completion_length": 145.4375, "epoch": 2.431246655965757, "grad_norm": 0.8102399110794067, "kl": 0.1770215630531311, "learning_rate": 5.278030160884296e-07, "loss": 0.0071, "reward": 1.5253437757492065, "reward_std": 0.7622177600860596, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44721874594688416, "step": 4544 }, { "completion_length": 114.09375, "epoch": 2.431781701444623, "grad_norm": 1.84520423412323, "kl": 0.20200957357883453, "learning_rate": 5.268468674404171e-07, "loss": 0.0081, "reward": 2.1972498893737793, "reward_std": 0.6803151369094849, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46287500858306885, "step": 4545 }, { "completion_length": 144.15625, "epoch": 2.4323167469234885, "grad_norm": 1.1036370992660522, "kl": 0.20751407742500305, "learning_rate": 5.2589148362437e-07, "loss": 0.0083, "reward": 1.2442187070846558, "reward_std": 0.6670543551445007, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.44734376668930054, "step": 4546 }, { "completion_length": 122.125, "epoch": 2.4328517924023543, "grad_norm": 0.6903401017189026, "kl": 0.21500390768051147, "learning_rate": 5.249368650106132e-07, "loss": 0.0086, "reward": 2.1939375400543213, "reward_std": 0.7183821201324463, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4595624804496765, "step": 4547 }, { "completion_length": 139.125, "epoch": 2.43338683788122, "grad_norm": 0.6132493615150452, "kl": 0.19435441493988037, "learning_rate": 5.239830119691741e-07, "loss": 0.0078, "reward": 1.695468783378601, "reward_std": 0.4281473457813263, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4767187535762787, "step": 4548 }, { "completion_length": 126.40625, "epoch": 2.4339218833600857, "grad_norm": 1.6445783376693726, "kl": 0.20482110977172852, "learning_rate": 5.230299248697867e-07, "loss": 0.0082, "reward": 2.5609374046325684, "reward_std": 0.8589293956756592, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4515625238418579, "step": 4549 }, { "completion_length": 135.78125, "epoch": 2.4344569288389515, "grad_norm": 1.2624582052230835, "kl": 0.17877185344696045, "learning_rate": 5.220776040818828e-07, "loss": 0.0072, "reward": 1.7476249933242798, "reward_std": 0.8843177556991577, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4507499933242798, "step": 4550 }, { "completion_length": 120.0, "epoch": 2.4349919743178168, "grad_norm": 2.9189274311065674, "kl": 0.3147195875644684, "learning_rate": 5.211260499746027e-07, "loss": 0.0126, "reward": 2.1246562004089355, "reward_std": 1.0990216732025146, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4840312600135803, "step": 4551 }, { "completion_length": 139.1875, "epoch": 2.435527019796683, "grad_norm": 1.0010926723480225, "kl": 0.26886993646621704, "learning_rate": 5.201752629167855e-07, "loss": 0.0108, "reward": 1.5100936889648438, "reward_std": 0.901764988899231, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4475937485694885, "step": 4552 }, { "completion_length": 115.84375, "epoch": 2.4360620652755482, "grad_norm": 0.6576729416847229, "kl": 0.2074081301689148, "learning_rate": 5.192252432769748e-07, "loss": 0.0083, "reward": 2.284468650817871, "reward_std": 0.39263367652893066, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47196877002716064, "step": 4553 }, { "completion_length": 110.34375, "epoch": 2.436597110754414, "grad_norm": 0.8737067580223083, "kl": 0.22079060971736908, "learning_rate": 5.18275991423417e-07, "loss": 0.0088, "reward": 2.6286563873291016, "reward_std": 0.6512125730514526, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.488031268119812, "step": 4554 }, { "completion_length": 140.96875, "epoch": 2.4371321562332797, "grad_norm": 2778.6396484375, "kl": 235.1746826171875, "learning_rate": 5.173275077240602e-07, "loss": 9.407, "reward": 1.4164999723434448, "reward_std": 0.8742343187332153, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4321250021457672, "step": 4555 }, { "completion_length": 144.8125, "epoch": 2.4376672017121455, "grad_norm": 1.3945845365524292, "kl": 0.16467711329460144, "learning_rate": 5.163797925465544e-07, "loss": 0.0066, "reward": 2.2353124618530273, "reward_std": 0.9495075345039368, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4696875214576721, "step": 4556 }, { "completion_length": 125.46875, "epoch": 2.438202247191011, "grad_norm": 3.8732359409332275, "kl": 0.21268683671951294, "learning_rate": 5.154328462582517e-07, "loss": 0.0085, "reward": 1.475968837738037, "reward_std": 0.5843921899795532, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46034374833106995, "step": 4557 }, { "completion_length": 127.5625, "epoch": 2.438737292669877, "grad_norm": 1.190703272819519, "kl": 0.19360490143299103, "learning_rate": 5.144866692262082e-07, "loss": 0.0077, "reward": 1.69350004196167, "reward_std": 0.5167481899261475, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45912498235702515, "step": 4558 }, { "completion_length": 120.59375, "epoch": 2.4392723381487427, "grad_norm": 0.8300957083702087, "kl": 0.1936565637588501, "learning_rate": 5.13541261817179e-07, "loss": 0.0077, "reward": 2.261593818664551, "reward_std": 0.5522163510322571, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.495968759059906, "step": 4559 }, { "completion_length": 146.1875, "epoch": 2.4398073836276084, "grad_norm": 3.0761630535125732, "kl": 0.1884870082139969, "learning_rate": 5.125966243976218e-07, "loss": 0.0075, "reward": 1.7995312213897705, "reward_std": 0.868259608745575, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4245312511920929, "step": 4560 }, { "completion_length": 140.15625, "epoch": 2.440342429106474, "grad_norm": 0.6551784873008728, "kl": 0.165274515748024, "learning_rate": 5.116527573336977e-07, "loss": 0.0066, "reward": 2.890625, "reward_std": 0.8119882345199585, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 4561 }, { "completion_length": 143.40625, "epoch": 2.44087747458534, "grad_norm": 1.0469778776168823, "kl": 0.14116792380809784, "learning_rate": 5.107096609912668e-07, "loss": 0.0056, "reward": 1.953125, "reward_std": 0.6630094051361084, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4375, "step": 4562 }, { "completion_length": 128.1875, "epoch": 2.4414125200642056, "grad_norm": 1.2160309553146362, "kl": 0.1631436049938202, "learning_rate": 5.097673357358906e-07, "loss": 0.0065, "reward": 2.564500093460083, "reward_std": 0.8284142017364502, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43949997425079346, "step": 4563 }, { "completion_length": 106.34375, "epoch": 2.4419475655430714, "grad_norm": 1.0188019275665283, "kl": 0.18655449151992798, "learning_rate": 5.088257819328338e-07, "loss": 0.0075, "reward": 2.625, "reward_std": 0.8535137176513672, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 4564 }, { "completion_length": 129.1875, "epoch": 2.442482611021937, "grad_norm": 0.4207308888435364, "kl": 0.15796558558940887, "learning_rate": 5.078849999470598e-07, "loss": 0.0063, "reward": 1.921875, "reward_std": 0.5974536538124084, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 4565 }, { "completion_length": 131.59375, "epoch": 2.4430176565008024, "grad_norm": 1.6386562585830688, "kl": 0.19003283977508545, "learning_rate": 5.069449901432336e-07, "loss": 0.0076, "reward": 2.1845626831054688, "reward_std": 0.6119023561477661, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4814375042915344, "step": 4566 }, { "completion_length": 129.28125, "epoch": 2.443552701979668, "grad_norm": 2.9486422538757324, "kl": 0.16777074337005615, "learning_rate": 5.060057528857218e-07, "loss": 0.0067, "reward": 1.6625624895095825, "reward_std": 0.8898983001708984, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4594374895095825, "step": 4567 }, { "completion_length": 135.4375, "epoch": 2.444087747458534, "grad_norm": 1.016507625579834, "kl": 0.1679050326347351, "learning_rate": 5.050672885385904e-07, "loss": 0.0067, "reward": 2.1190624237060547, "reward_std": 0.9304207563400269, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47843748331069946, "step": 4568 }, { "completion_length": 117.9375, "epoch": 2.4446227929373996, "grad_norm": 2.5939784049987793, "kl": 0.22083133459091187, "learning_rate": 5.04129597465606e-07, "loss": 0.0088, "reward": 2.1237499713897705, "reward_std": 0.7719099521636963, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4675000011920929, "step": 4569 }, { "completion_length": 132.8125, "epoch": 2.4451578384162653, "grad_norm": 0.9357548952102661, "kl": 0.16999584436416626, "learning_rate": 5.031926800302353e-07, "loss": 0.0068, "reward": 2.240687370300293, "reward_std": 0.4297616183757782, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4438124895095825, "step": 4570 }, { "completion_length": 119.9375, "epoch": 2.445692883895131, "grad_norm": 1.308850884437561, "kl": 0.27354171872138977, "learning_rate": 5.022565365956464e-07, "loss": 0.0109, "reward": 2.430906295776367, "reward_std": 0.685810923576355, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4934062361717224, "step": 4571 }, { "completion_length": 116.875, "epoch": 2.446227929373997, "grad_norm": 0.4547680914402008, "kl": 0.1662978231906891, "learning_rate": 5.013211675247057e-07, "loss": 0.0067, "reward": 2.734250068664551, "reward_std": 0.6073507070541382, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 4572 }, { "completion_length": 115.09375, "epoch": 2.4467629748528625, "grad_norm": 1.1021283864974976, "kl": 0.21767425537109375, "learning_rate": 5.003865731799801e-07, "loss": 0.0087, "reward": 1.7533750534057617, "reward_std": 0.35164737701416016, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47212502360343933, "step": 4573 }, { "completion_length": 116.90625, "epoch": 2.4472980203317283, "grad_norm": 1.1962969303131104, "kl": 0.2663285732269287, "learning_rate": 4.994527539237376e-07, "loss": 0.0107, "reward": 1.82631254196167, "reward_std": 0.7641507387161255, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45131248235702515, "step": 4574 }, { "completion_length": 115.03125, "epoch": 2.447833065810594, "grad_norm": 1.6573344469070435, "kl": 0.21575434505939484, "learning_rate": 4.98519710117942e-07, "loss": 0.0086, "reward": 2.2791874408721924, "reward_std": 0.9558627605438232, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48231250047683716, "step": 4575 }, { "completion_length": 137.0625, "epoch": 2.4483681112894597, "grad_norm": 0.6314204335212708, "kl": 0.15869803726673126, "learning_rate": 4.975874421242605e-07, "loss": 0.0063, "reward": 2.098843812942505, "reward_std": 0.6306050419807434, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4425937533378601, "step": 4576 }, { "completion_length": 135.1875, "epoch": 2.4489031567683255, "grad_norm": 0.8150301575660706, "kl": 0.1894226372241974, "learning_rate": 4.966559503040591e-07, "loss": 0.0076, "reward": 1.9185937643051147, "reward_std": 0.6939559578895569, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48109376430511475, "step": 4577 }, { "completion_length": 154.28125, "epoch": 2.449438202247191, "grad_norm": 0.851051926612854, "kl": 0.29442229866981506, "learning_rate": 4.957252350183994e-07, "loss": 0.0118, "reward": 1.8691251277923584, "reward_std": 0.9883982539176941, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43162500858306885, "step": 4578 }, { "completion_length": 103.28125, "epoch": 2.4499732477260565, "grad_norm": 0.8807640075683594, "kl": 0.18354319036006927, "learning_rate": 4.947952966280467e-07, "loss": 0.0073, "reward": 2.285031318664551, "reward_std": 0.9709335565567017, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 4579 }, { "completion_length": 135.53125, "epoch": 2.4505082932049222, "grad_norm": 1.2738926410675049, "kl": 0.16412171721458435, "learning_rate": 4.938661354934615e-07, "loss": 0.0066, "reward": 2.120968818664551, "reward_std": 0.8958861827850342, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.480343759059906, "step": 4580 }, { "completion_length": 125.0625, "epoch": 2.451043338683788, "grad_norm": 1.7639286518096924, "kl": 0.2745320200920105, "learning_rate": 4.929377519748046e-07, "loss": 0.011, "reward": 1.7698125839233398, "reward_std": 0.7797203660011292, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4729374945163727, "step": 4581 }, { "completion_length": 117.59375, "epoch": 2.4515783841626537, "grad_norm": 1.4277037382125854, "kl": 0.23754990100860596, "learning_rate": 4.920101464319363e-07, "loss": 0.0095, "reward": 2.3830623626708984, "reward_std": 0.7503191828727722, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.492437481880188, "step": 4582 }, { "completion_length": 131.0625, "epoch": 2.4521134296415195, "grad_norm": 0.8386632204055786, "kl": 0.20189473032951355, "learning_rate": 4.910833192244135e-07, "loss": 0.0081, "reward": 1.9210624694824219, "reward_std": 1.04011869430542, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4523124694824219, "step": 4583 }, { "completion_length": 148.75, "epoch": 2.452648475120385, "grad_norm": 1703313.375, "kl": 11264.0869140625, "learning_rate": 4.901572707114921e-07, "loss": 450.5635, "reward": 1.7444062232971191, "reward_std": 0.7494992017745972, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.40065625309944153, "step": 4584 }, { "completion_length": 111.65625, "epoch": 2.453183520599251, "grad_norm": 1.491292119026184, "kl": 0.26723936200141907, "learning_rate": 4.89232001252126e-07, "loss": 0.0107, "reward": 2.543687582015991, "reward_std": 0.8461236953735352, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48118749260902405, "step": 4585 }, { "completion_length": 135.9375, "epoch": 2.4537185660781167, "grad_norm": 2.08482027053833, "kl": 0.16492483019828796, "learning_rate": 4.883075112049682e-07, "loss": 0.0066, "reward": 2.008906364440918, "reward_std": 0.9034667611122131, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4620312452316284, "step": 4586 }, { "completion_length": 129.875, "epoch": 2.4542536115569824, "grad_norm": 1.4006997346878052, "kl": 0.17788589000701904, "learning_rate": 4.873838009283685e-07, "loss": 0.0071, "reward": 2.0, "reward_std": 0.6454803347587585, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4375, "step": 4587 }, { "completion_length": 135.28125, "epoch": 2.454788657035848, "grad_norm": 1.2187601327896118, "kl": 0.15073879063129425, "learning_rate": 4.86460870780374e-07, "loss": 0.006, "reward": 1.9201250076293945, "reward_std": 0.7005075216293335, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46700000762939453, "step": 4588 }, { "completion_length": 121.03125, "epoch": 2.455323702514714, "grad_norm": 0.7006935477256775, "kl": 0.14817503094673157, "learning_rate": 4.85538721118731e-07, "loss": 0.0059, "reward": 2.301093578338623, "reward_std": 0.4983524680137634, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4729687571525574, "step": 4589 }, { "completion_length": 137.21875, "epoch": 2.4558587479935796, "grad_norm": 1.841073989868164, "kl": 0.16993369162082672, "learning_rate": 4.846173523008823e-07, "loss": 0.0068, "reward": 2.0546875, "reward_std": 1.0552289485931396, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4296875, "step": 4590 }, { "completion_length": 135.8125, "epoch": 2.4563937934724454, "grad_norm": 0.8557561635971069, "kl": 0.19592422246932983, "learning_rate": 4.836967646839672e-07, "loss": 0.0078, "reward": 2.20703125, "reward_std": 0.8940224647521973, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 4591 }, { "completion_length": 122.53125, "epoch": 2.4569288389513106, "grad_norm": 2.9798877239227295, "kl": 0.24810978770256042, "learning_rate": 4.82776958624824e-07, "loss": 0.0099, "reward": 1.55859375, "reward_std": 0.35475069284439087, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49609375, "step": 4592 }, { "completion_length": 129.03125, "epoch": 2.4574638844301764, "grad_norm": 0.6386441588401794, "kl": 0.2000543475151062, "learning_rate": 4.818579344799873e-07, "loss": 0.008, "reward": 2.0693750381469727, "reward_std": 1.0292385816574097, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46000000834465027, "step": 4593 }, { "completion_length": 123.75, "epoch": 2.457998929909042, "grad_norm": 1.6493864059448242, "kl": 0.1618136763572693, "learning_rate": 4.809396926056881e-07, "loss": 0.0065, "reward": 2.4375312328338623, "reward_std": 0.5877609848976135, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4375312328338623, "step": 4594 }, { "completion_length": 115.1875, "epoch": 2.458533975387908, "grad_norm": 0.9164217114448547, "kl": 0.2651642858982086, "learning_rate": 4.800222333578536e-07, "loss": 0.0106, "reward": 2.0319061279296875, "reward_std": 0.770763635635376, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48503124713897705, "step": 4595 }, { "completion_length": 110.6875, "epoch": 2.4590690208667736, "grad_norm": 0.8735861778259277, "kl": 0.2060796618461609, "learning_rate": 4.791055570921102e-07, "loss": 0.0082, "reward": 2.343656301498413, "reward_std": 0.8543386459350586, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4842812418937683, "step": 4596 }, { "completion_length": 118.625, "epoch": 2.4596040663456393, "grad_norm": 0.6199306845664978, "kl": 0.18930527567863464, "learning_rate": 4.781896641637787e-07, "loss": 0.0076, "reward": 1.5940001010894775, "reward_std": 0.4722575545310974, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.468999981880188, "step": 4597 }, { "completion_length": 160.03125, "epoch": 2.460139111824505, "grad_norm": 0.9917996525764465, "kl": 0.2900323271751404, "learning_rate": 4.772745549278757e-07, "loss": 0.0116, "reward": 1.4416875839233398, "reward_std": 0.6128990650177002, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4260624945163727, "step": 4598 }, { "completion_length": 142.25, "epoch": 2.460674157303371, "grad_norm": 6.289154052734375, "kl": 0.34223026037216187, "learning_rate": 4.7636022973911676e-07, "loss": 0.0137, "reward": 2.337249994277954, "reward_std": 1.1747994422912598, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4309999942779541, "step": 4599 }, { "completion_length": 123.25, "epoch": 2.4612092027822365, "grad_norm": 0.7008895874023438, "kl": 0.16447186470031738, "learning_rate": 4.7544668895191076e-07, "loss": 0.0066, "reward": 2.4330313205718994, "reward_std": 0.44505107402801514, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44865626096725464, "step": 4600 }, { "completion_length": 129.75, "epoch": 2.4617442482611023, "grad_norm": 0.4625491797924042, "kl": 0.15504074096679688, "learning_rate": 4.7453393292036326e-07, "loss": 0.0062, "reward": 2.09375, "reward_std": 0.5752565860748291, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46875, "step": 4601 }, { "completion_length": 108.625, "epoch": 2.462279293739968, "grad_norm": 0.8646864891052246, "kl": 0.20879530906677246, "learning_rate": 4.736219619982782e-07, "loss": 0.0084, "reward": 2.650624990463257, "reward_std": 0.9538778066635132, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49437499046325684, "step": 4602 }, { "completion_length": 153.84375, "epoch": 2.4628143392188337, "grad_norm": 0.6784641742706299, "kl": 0.12947238981723785, "learning_rate": 4.7271077653914977e-07, "loss": 0.0052, "reward": 1.8291250467300415, "reward_std": 1.1531232595443726, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 4603 }, { "completion_length": 141.96875, "epoch": 2.4633493846976995, "grad_norm": 1.2622005939483643, "kl": 0.2641424834728241, "learning_rate": 4.718003768961732e-07, "loss": 0.0106, "reward": 2.125, "reward_std": 0.5817437171936035, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 4604 }, { "completion_length": 140.59375, "epoch": 2.4638844301765648, "grad_norm": 1.2871170043945312, "kl": 0.14147499203681946, "learning_rate": 4.7089076342223594e-07, "loss": 0.0057, "reward": 2.0097813606262207, "reward_std": 1.0004737377166748, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.40040624141693115, "step": 4605 }, { "completion_length": 143.15625, "epoch": 2.464419475655431, "grad_norm": 0.7500649094581604, "kl": 0.16513609886169434, "learning_rate": 4.699819364699215e-07, "loss": 0.0066, "reward": 2.41015625, "reward_std": 0.7729656100273132, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 4606 }, { "completion_length": 116.0625, "epoch": 2.4649545211342963, "grad_norm": 7.132961273193359, "kl": 2.279381513595581, "learning_rate": 4.6907389639150954e-07, "loss": 0.0912, "reward": 2.3643438816070557, "reward_std": 0.7387446165084839, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4893437623977661, "step": 4607 }, { "completion_length": 132.84375, "epoch": 2.465489566613162, "grad_norm": 0.8290930986404419, "kl": 0.1650804579257965, "learning_rate": 4.6816664353897314e-07, "loss": 0.0066, "reward": 2.3416249752044678, "reward_std": 0.708834707736969, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48225000500679016, "step": 4608 }, { "completion_length": 152.40625, "epoch": 2.4660246120920277, "grad_norm": 1.0633862018585205, "kl": 0.18719647824764252, "learning_rate": 4.672601782639802e-07, "loss": 0.0075, "reward": 1.5880937576293945, "reward_std": 0.5230733156204224, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47871875762939453, "step": 4609 }, { "completion_length": 154.40625, "epoch": 2.4665596575708935, "grad_norm": 1.1038392782211304, "kl": 0.21915887296199799, "learning_rate": 4.6635450091789547e-07, "loss": 0.0088, "reward": 1.383468747138977, "reward_std": 0.949716329574585, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.39909374713897705, "step": 4610 }, { "completion_length": 128.53125, "epoch": 2.467094703049759, "grad_norm": 0.9200294017791748, "kl": 0.16139854490756989, "learning_rate": 4.654496118517762e-07, "loss": 0.0065, "reward": 2.1143436431884766, "reward_std": 0.6981325149536133, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4737187623977661, "step": 4611 }, { "completion_length": 118.28125, "epoch": 2.467629748528625, "grad_norm": 2.1781861782073975, "kl": 0.3151242136955261, "learning_rate": 4.6454551141637464e-07, "loss": 0.0127, "reward": 2.768218755722046, "reward_std": 0.9258276224136353, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4713437557220459, "step": 4612 }, { "completion_length": 120.03125, "epoch": 2.4681647940074907, "grad_norm": 0.7913393378257751, "kl": 0.17897263169288635, "learning_rate": 4.636421999621368e-07, "loss": 0.0072, "reward": 2.4914374351501465, "reward_std": 0.5713418126106262, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46018749475479126, "step": 4613 }, { "completion_length": 139.8125, "epoch": 2.4686998394863564, "grad_norm": 0.7367011308670044, "kl": 0.19194123148918152, "learning_rate": 4.6273967783920485e-07, "loss": 0.0077, "reward": 1.6756563186645508, "reward_std": 0.5595607757568359, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 4614 }, { "completion_length": 128.25, "epoch": 2.469234884965222, "grad_norm": 247626288.0, "kl": 3134396.75, "learning_rate": 4.618379453974131e-07, "loss": 125375.8828, "reward": 1.828624963760376, "reward_std": 0.7695580720901489, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43799999356269836, "step": 4615 }, { "completion_length": 120.4375, "epoch": 2.469769930444088, "grad_norm": 1.1063668727874756, "kl": 0.20596691966056824, "learning_rate": 4.609370029862895e-07, "loss": 0.0082, "reward": 2.9789376258850098, "reward_std": 0.8729376792907715, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4633125066757202, "step": 4616 }, { "completion_length": 104.03125, "epoch": 2.4703049759229536, "grad_norm": 0.9952802658081055, "kl": 0.16986572742462158, "learning_rate": 4.6003685095505844e-07, "loss": 0.0068, "reward": 2.631312370300293, "reward_std": 0.7353746891021729, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4906874895095825, "step": 4617 }, { "completion_length": 141.15625, "epoch": 2.470840021401819, "grad_norm": 1.023018717765808, "kl": 0.1737777590751648, "learning_rate": 4.5913748965263403e-07, "loss": 0.007, "reward": 1.59375, "reward_std": 0.6889288425445557, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.453125, "step": 4618 }, { "completion_length": 142.4375, "epoch": 2.471375066880685, "grad_norm": 1.7221887111663818, "kl": 0.2745319902896881, "learning_rate": 4.582389194276268e-07, "loss": 0.011, "reward": 1.6933749914169312, "reward_std": 0.829442024230957, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41212499141693115, "step": 4619 }, { "completion_length": 133.625, "epoch": 2.4719101123595504, "grad_norm": 1.9198999404907227, "kl": 0.30804580450057983, "learning_rate": 4.5734114062834095e-07, "loss": 0.0123, "reward": 2.4048123359680176, "reward_std": 0.7844992876052856, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4673125147819519, "step": 4620 }, { "completion_length": 134.9375, "epoch": 2.472445157838416, "grad_norm": 7.5066237449646, "kl": 0.8269110918045044, "learning_rate": 4.5644415360277057e-07, "loss": 0.0331, "reward": 1.8551561832427979, "reward_std": 0.8193864822387695, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4176562428474426, "step": 4621 }, { "completion_length": 145.46875, "epoch": 2.472980203317282, "grad_norm": 1.8078066110610962, "kl": 0.25195518136024475, "learning_rate": 4.555479586986067e-07, "loss": 0.0101, "reward": 1.9357500076293945, "reward_std": 1.1575225591659546, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.40450000762939453, "step": 4622 }, { "completion_length": 125.1875, "epoch": 2.4735152487961476, "grad_norm": 0.8337717056274414, "kl": 0.16434234380722046, "learning_rate": 4.546525562632312e-07, "loss": 0.0066, "reward": 2.260531187057495, "reward_std": 0.5785074234008789, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4949062466621399, "step": 4623 }, { "completion_length": 119.25, "epoch": 2.4740502942750133, "grad_norm": 2.2105908393859863, "kl": 0.3763677775859833, "learning_rate": 4.537579466437184e-07, "loss": 0.0151, "reward": 2.44921875, "reward_std": 0.6245759725570679, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48046875, "step": 4624 }, { "completion_length": 130.84375, "epoch": 2.474585339753879, "grad_norm": 0.6037742495536804, "kl": 0.20629540085792542, "learning_rate": 4.5286413018683725e-07, "loss": 0.0083, "reward": 2.1544687747955322, "reward_std": 0.8241032958030701, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48259374499320984, "step": 4625 }, { "completion_length": 136.15625, "epoch": 2.475120385232745, "grad_norm": 1.422859787940979, "kl": 0.2647138237953186, "learning_rate": 4.519711072390479e-07, "loss": 0.0106, "reward": 1.9373124837875366, "reward_std": 0.9763069152832031, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4529374837875366, "step": 4626 }, { "completion_length": 117.34375, "epoch": 2.4756554307116105, "grad_norm": 482.1604919433594, "kl": 2.3744924068450928, "learning_rate": 4.510788781465031e-07, "loss": 0.095, "reward": 2.3018436431884766, "reward_std": 0.7770917415618896, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4737187623977661, "step": 4627 }, { "completion_length": 109.375, "epoch": 2.4761904761904763, "grad_norm": 1.1202363967895508, "kl": 0.23560862243175507, "learning_rate": 4.5018744325504683e-07, "loss": 0.0094, "reward": 2.688624858856201, "reward_std": 0.7717151045799255, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4854999780654907, "step": 4628 }, { "completion_length": 94.5, "epoch": 2.476725521669342, "grad_norm": 1.1365898847579956, "kl": 0.24735911190509796, "learning_rate": 4.492968029102182e-07, "loss": 0.0099, "reward": 2.1568751335144043, "reward_std": 0.6108731031417847, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48499998450279236, "step": 4629 }, { "completion_length": 115.09375, "epoch": 2.4772605671482077, "grad_norm": 0.7387455701828003, "kl": 0.1728808879852295, "learning_rate": 4.4840695745724554e-07, "loss": 0.0069, "reward": 2.338124990463257, "reward_std": 0.580888032913208, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47874999046325684, "step": 4630 }, { "completion_length": 133.125, "epoch": 2.4777956126270735, "grad_norm": 0.6081804037094116, "kl": 0.1511373519897461, "learning_rate": 4.475179072410496e-07, "loss": 0.006, "reward": 2.4386563301086426, "reward_std": 0.7077165842056274, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4855312407016754, "step": 4631 }, { "completion_length": 114.65625, "epoch": 2.478330658105939, "grad_norm": 0.7403506636619568, "kl": 0.16624417901039124, "learning_rate": 4.466296526062441e-07, "loss": 0.0066, "reward": 3.03125, "reward_std": 0.6101624965667725, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 4632 }, { "completion_length": 115.09375, "epoch": 2.4788657035848045, "grad_norm": 1.0848994255065918, "kl": 0.2156023532152176, "learning_rate": 4.4574219389713344e-07, "loss": 0.0086, "reward": 2.596437454223633, "reward_std": 0.870410144329071, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4558125138282776, "step": 4633 }, { "completion_length": 122.1875, "epoch": 2.4794007490636703, "grad_norm": 1.755042552947998, "kl": 0.22415851056575775, "learning_rate": 4.4485553145771243e-07, "loss": 0.009, "reward": 1.8810625076293945, "reward_std": 0.7483659982681274, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47481250762939453, "step": 4634 }, { "completion_length": 125.375, "epoch": 2.479935794542536, "grad_norm": 1.012656569480896, "kl": 0.17566746473312378, "learning_rate": 4.4396966563167024e-07, "loss": 0.007, "reward": 2.3514060974121094, "reward_std": 0.594584584236145, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4920312464237213, "step": 4635 }, { "completion_length": 120.96875, "epoch": 2.4804708400214017, "grad_norm": 1.6073477268218994, "kl": 0.20679984986782074, "learning_rate": 4.430845967623845e-07, "loss": 0.0083, "reward": 2.7530312538146973, "reward_std": 0.6517724990844727, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47178125381469727, "step": 4636 }, { "completion_length": 133.34375, "epoch": 2.4810058855002675, "grad_norm": 0.7305359244346619, "kl": 0.1379798948764801, "learning_rate": 4.422003251929244e-07, "loss": 0.0055, "reward": 1.7990312576293945, "reward_std": 1.0526740550994873, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43965625762939453, "step": 4637 }, { "completion_length": 132.53125, "epoch": 2.481540930979133, "grad_norm": 0.7109506726264954, "kl": 0.16560505330562592, "learning_rate": 4.413168512660515e-07, "loss": 0.0066, "reward": 1.669281244277954, "reward_std": 0.8035553693771362, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4192812442779541, "step": 4638 }, { "completion_length": 124.5, "epoch": 2.482075976457999, "grad_norm": 1.0585814714431763, "kl": 0.192917600274086, "learning_rate": 4.4043417532421693e-07, "loss": 0.0077, "reward": 2.397125005722046, "reward_std": 0.7929887771606445, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4283750057220459, "step": 4639 }, { "completion_length": 116.75, "epoch": 2.4826110219368647, "grad_norm": 0.8677814602851868, "kl": 0.1951953023672104, "learning_rate": 4.395522977095626e-07, "loss": 0.0078, "reward": 2.2803125381469727, "reward_std": 0.6375531554222107, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49906250834465027, "step": 4640 }, { "completion_length": 121.03125, "epoch": 2.4831460674157304, "grad_norm": 0.7457031011581421, "kl": 0.18475842475891113, "learning_rate": 4.3867121876392056e-07, "loss": 0.0074, "reward": 1.7090625762939453, "reward_std": 0.6897419691085815, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44343751668930054, "step": 4641 }, { "completion_length": 156.75, "epoch": 2.483681112894596, "grad_norm": 0.9600867033004761, "kl": 0.16476920247077942, "learning_rate": 4.377909388288154e-07, "loss": 0.0066, "reward": 1.3151874542236328, "reward_std": 0.8972395658493042, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4245625138282776, "step": 4642 }, { "completion_length": 135.1875, "epoch": 2.484216158373462, "grad_norm": 1.6731741428375244, "kl": 0.15181894600391388, "learning_rate": 4.369114582454595e-07, "loss": 0.0061, "reward": 2.4658751487731934, "reward_std": 0.9254696369171143, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45024996995925903, "step": 4643 }, { "completion_length": 131.625, "epoch": 2.4847512038523276, "grad_norm": 1.8528673648834229, "kl": 0.22261036932468414, "learning_rate": 4.3603277735475596e-07, "loss": 0.0089, "reward": 1.9816563129425049, "reward_std": 0.565222978591919, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4660312533378601, "step": 4644 }, { "completion_length": 129.28125, "epoch": 2.4852862493311934, "grad_norm": 0.7191504836082458, "kl": 0.17231959104537964, "learning_rate": 4.3515489649730014e-07, "loss": 0.0069, "reward": 1.9412813186645508, "reward_std": 0.9840351343154907, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4412812292575836, "step": 4645 }, { "completion_length": 135.6875, "epoch": 2.4858212948100586, "grad_norm": 0.9147541522979736, "kl": 0.12469586730003357, "learning_rate": 4.3427781601337313e-07, "loss": 0.005, "reward": 2.2569375038146973, "reward_std": 1.0802332162857056, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44443750381469727, "step": 4646 }, { "completion_length": 148.65625, "epoch": 2.4863563402889244, "grad_norm": 0.7412762641906738, "kl": 0.14361149072647095, "learning_rate": 4.334015362429492e-07, "loss": 0.0057, "reward": 1.796875, "reward_std": 0.7474181652069092, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46875, "step": 4647 }, { "completion_length": 99.3125, "epoch": 2.48689138576779, "grad_norm": 0.6170840263366699, "kl": 0.21647077798843384, "learning_rate": 4.325260575256926e-07, "loss": 0.0087, "reward": 3.0625, "reward_std": 0.523028552532196, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 4648 }, { "completion_length": 119.53125, "epoch": 2.487426431246656, "grad_norm": 1.0562764406204224, "kl": 0.21051247417926788, "learning_rate": 4.316513802009528e-07, "loss": 0.0084, "reward": 1.5529999732971191, "reward_std": 0.7879499197006226, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44362500309944153, "step": 4649 }, { "completion_length": 155.0625, "epoch": 2.4879614767255216, "grad_norm": 1.6658921241760254, "kl": 0.32726699113845825, "learning_rate": 4.307775046077739e-07, "loss": 0.0131, "reward": 1.87109375, "reward_std": 0.8504648208618164, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48046875, "step": 4650 }, { "completion_length": 134.96875, "epoch": 2.4884965222043873, "grad_norm": 1.0416573286056519, "kl": 0.1840878576040268, "learning_rate": 4.299044310848857e-07, "loss": 0.0074, "reward": 1.6839375495910645, "reward_std": 0.6225907206535339, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48081251978874207, "step": 4651 }, { "completion_length": 129.65625, "epoch": 2.489031567683253, "grad_norm": 1.1073710918426514, "kl": 0.18756549060344696, "learning_rate": 4.290321599707078e-07, "loss": 0.0075, "reward": 2.34765625, "reward_std": 0.8986948728561401, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 4652 }, { "completion_length": 121.5, "epoch": 2.489566613162119, "grad_norm": 1.1198545694351196, "kl": 0.20277172327041626, "learning_rate": 4.281606916033507e-07, "loss": 0.0081, "reward": 2.0374999046325684, "reward_std": 0.6375083327293396, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4749999940395355, "step": 4653 }, { "completion_length": 130.625, "epoch": 2.4901016586409845, "grad_norm": 0.7476745247840881, "kl": 0.19621770083904266, "learning_rate": 4.2729002632061145e-07, "loss": 0.0078, "reward": 2.524625062942505, "reward_std": 1.1986726522445679, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4621250033378601, "step": 4654 }, { "completion_length": 132.40625, "epoch": 2.4906367041198503, "grad_norm": 1.634040117263794, "kl": 0.15189838409423828, "learning_rate": 4.2642016445997636e-07, "loss": 0.0061, "reward": 1.87890625, "reward_std": 0.5739598274230957, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 4655 }, { "completion_length": 146.375, "epoch": 2.491171749598716, "grad_norm": 1.2564618587493896, "kl": 0.1875123530626297, "learning_rate": 4.2555110635862013e-07, "loss": 0.0075, "reward": 2.0318751335144043, "reward_std": 0.6815003752708435, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46937501430511475, "step": 4656 }, { "completion_length": 146.21875, "epoch": 2.4917067950775817, "grad_norm": 1.1807544231414795, "kl": 0.23885485529899597, "learning_rate": 4.2468285235340744e-07, "loss": 0.0096, "reward": 1.291062593460083, "reward_std": 0.7489949464797974, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43168750405311584, "step": 4657 }, { "completion_length": 135.28125, "epoch": 2.4922418405564475, "grad_norm": 2.10675048828125, "kl": 0.25029125809669495, "learning_rate": 4.2381540278088993e-07, "loss": 0.01, "reward": 1.3608124256134033, "reward_std": 0.7211305499076843, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4389374852180481, "step": 4658 }, { "completion_length": 146.46875, "epoch": 2.4927768860353128, "grad_norm": 1.299329400062561, "kl": 0.1482858955860138, "learning_rate": 4.229487579773067e-07, "loss": 0.0059, "reward": 2.1792187690734863, "reward_std": 0.6003508567810059, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46046873927116394, "step": 4659 }, { "completion_length": 119.9375, "epoch": 2.4933119315141785, "grad_norm": 0.7965656518936157, "kl": 0.22593535482883453, "learning_rate": 4.220829182785871e-07, "loss": 0.009, "reward": 2.40625, "reward_std": 0.8141578435897827, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.5, "step": 4660 }, { "completion_length": 106.03125, "epoch": 2.4938469769930443, "grad_norm": 2.3054182529449463, "kl": 0.1749270111322403, "learning_rate": 4.212178840203468e-07, "loss": 0.007, "reward": 2.554374933242798, "reward_std": 0.5655533075332642, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4918749928474426, "step": 4661 }, { "completion_length": 122.875, "epoch": 2.49438202247191, "grad_norm": 0.747475266456604, "kl": 0.15567119419574738, "learning_rate": 4.20353655537889e-07, "loss": 0.0062, "reward": 1.671875, "reward_std": 0.5901220440864563, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 4662 }, { "completion_length": 138.375, "epoch": 2.4949170679507757, "grad_norm": 33.471805572509766, "kl": 0.3684315085411072, "learning_rate": 4.194902331662071e-07, "loss": 0.0147, "reward": 1.7725000381469727, "reward_std": 1.026647925376892, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.41312500834465027, "step": 4663 }, { "completion_length": 146.09375, "epoch": 2.4954521134296415, "grad_norm": 0.9423152804374695, "kl": 0.2796611785888672, "learning_rate": 4.1862761723997727e-07, "loss": 0.0112, "reward": 1.6476874351501465, "reward_std": 1.1591256856918335, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.39768752455711365, "step": 4664 }, { "completion_length": 155.9375, "epoch": 2.495987158908507, "grad_norm": 1.3261091709136963, "kl": 0.16594485938549042, "learning_rate": 4.1776580809356845e-07, "loss": 0.0066, "reward": 1.2675000429153442, "reward_std": 0.7511183023452759, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.37687501311302185, "step": 4665 }, { "completion_length": 120.90625, "epoch": 2.496522204387373, "grad_norm": 1.0760871171951294, "kl": 0.16941557824611664, "learning_rate": 4.169048060610334e-07, "loss": 0.0068, "reward": 2.548281192779541, "reward_std": 0.8313737511634827, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4857812523841858, "step": 4666 }, { "completion_length": 117.9375, "epoch": 2.4970572498662387, "grad_norm": 1.6694647073745728, "kl": 0.19562216103076935, "learning_rate": 4.160446114761124e-07, "loss": 0.0078, "reward": 2.8852813243865967, "reward_std": 0.8649895191192627, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4790312647819519, "step": 4667 }, { "completion_length": 147.5, "epoch": 2.4975922953451044, "grad_norm": 88.90361022949219, "kl": 1.9424610137939453, "learning_rate": 4.151852246722346e-07, "loss": 0.0777, "reward": 1.3585624694824219, "reward_std": 0.37873998284339905, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4523124694824219, "step": 4668 }, { "completion_length": 107.78125, "epoch": 2.49812734082397, "grad_norm": 0.7208338975906372, "kl": 0.2376895397901535, "learning_rate": 4.1432664598251405e-07, "loss": 0.0095, "reward": 2.0147500038146973, "reward_std": 0.8415026664733887, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48350000381469727, "step": 4669 }, { "completion_length": 137.25, "epoch": 2.498662386302836, "grad_norm": 50246872.0, "kl": 288468.5625, "learning_rate": 4.1346887573975205e-07, "loss": 11538.7422, "reward": 1.2413125038146973, "reward_std": 0.5348560810089111, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.42881250381469727, "step": 4670 }, { "completion_length": 129.96875, "epoch": 2.4991974317817016, "grad_norm": 1.816443681716919, "kl": 0.18575340509414673, "learning_rate": 4.126119142764379e-07, "loss": 0.0074, "reward": 2.068312644958496, "reward_std": 0.7506264448165894, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47456249594688416, "step": 4671 }, { "completion_length": 135.65625, "epoch": 2.499732477260567, "grad_norm": 0.9929684400558472, "kl": 0.26851820945739746, "learning_rate": 4.117557619247456e-07, "loss": 0.0107, "reward": 1.5092812776565552, "reward_std": 0.7698047161102295, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4467812776565552, "step": 4672 }, { "completion_length": 118.0, "epoch": 2.500267522739433, "grad_norm": 1.428768515586853, "kl": 0.17183081805706024, "learning_rate": 4.109004190165361e-07, "loss": 0.0069, "reward": 2.120312452316284, "reward_std": 0.6109603643417358, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.43281251192092896, "step": 4673 }, { "completion_length": 141.65625, "epoch": 2.5008025682182984, "grad_norm": 0.8229043483734131, "kl": 0.15228378772735596, "learning_rate": 4.1004588588335615e-07, "loss": 0.0061, "reward": 2.1171562671661377, "reward_std": 0.5759580135345459, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765312671661377, "step": 4674 }, { "completion_length": 139.78125, "epoch": 2.501337613697164, "grad_norm": 21.759029388427734, "kl": 1.531017780303955, "learning_rate": 4.091921628564405e-07, "loss": 0.0612, "reward": 1.760812520980835, "reward_std": 0.8368508815765381, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4639374911785126, "step": 4675 }, { "completion_length": 134.5, "epoch": 2.50187265917603, "grad_norm": 2.4354665279388428, "kl": 0.18081068992614746, "learning_rate": 4.0833925026670775e-07, "loss": 0.0072, "reward": 1.7651875019073486, "reward_std": 0.9588760137557983, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46831250190734863, "step": 4676 }, { "completion_length": 134.4375, "epoch": 2.5024077046548956, "grad_norm": 0.9116227626800537, "kl": 0.1858285367488861, "learning_rate": 4.0748714844476264e-07, "loss": 0.0074, "reward": 2.313406229019165, "reward_std": 1.1492767333984375, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48528122901916504, "step": 4677 }, { "completion_length": 108.75, "epoch": 2.5029427501337613, "grad_norm": 2.0241520404815674, "kl": 0.28029072284698486, "learning_rate": 4.0663585772089757e-07, "loss": 0.0112, "reward": 2.7804064750671387, "reward_std": 0.8459458351135254, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48353123664855957, "step": 4678 }, { "completion_length": 127.46875, "epoch": 2.503477795612627, "grad_norm": 0.9660378694534302, "kl": 0.18528389930725098, "learning_rate": 4.057853784250884e-07, "loss": 0.0074, "reward": 1.9284687042236328, "reward_std": 0.8048959970474243, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4909687638282776, "step": 4679 }, { "completion_length": 137.75, "epoch": 2.504012841091493, "grad_norm": 1.302153468132019, "kl": 0.1667584478855133, "learning_rate": 4.049357108869964e-07, "loss": 0.0067, "reward": 1.8561251163482666, "reward_std": 0.3464929461479187, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.44987499713897705, "step": 4680 }, { "completion_length": 106.3125, "epoch": 2.5045478865703585, "grad_norm": 0.8592033386230469, "kl": 0.1721809208393097, "learning_rate": 4.040868554359706e-07, "loss": 0.0069, "reward": 2.559499979019165, "reward_std": 0.6410472393035889, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4657500088214874, "step": 4681 }, { "completion_length": 122.53125, "epoch": 2.5050829320492243, "grad_norm": 1.2097491025924683, "kl": 0.20940503478050232, "learning_rate": 4.0323881240104277e-07, "loss": 0.0084, "reward": 1.7471249103546143, "reward_std": 0.8462389707565308, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43462496995925903, "step": 4682 }, { "completion_length": 164.15625, "epoch": 2.50561797752809, "grad_norm": 1.0484987497329712, "kl": 0.2047211080789566, "learning_rate": 4.023915821109306e-07, "loss": 0.0082, "reward": 1.1335312128067017, "reward_std": 0.6072941422462463, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.38353127241134644, "step": 4683 }, { "completion_length": 128.1875, "epoch": 2.5061530230069557, "grad_norm": 1.275343656539917, "kl": 0.1668204367160797, "learning_rate": 4.0154516489403605e-07, "loss": 0.0067, "reward": 2.625, "reward_std": 0.5283679962158203, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 4684 }, { "completion_length": 111.09375, "epoch": 2.506688068485821, "grad_norm": 7.984098434448242, "kl": 0.38696998357772827, "learning_rate": 4.0069956107844836e-07, "loss": 0.0155, "reward": 1.8496249914169312, "reward_std": 0.2927725613117218, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47462499141693115, "step": 4685 }, { "completion_length": 144.21875, "epoch": 2.5072231139646872, "grad_norm": 0.9561876058578491, "kl": 0.1784745454788208, "learning_rate": 3.998547709919387e-07, "loss": 0.0071, "reward": 2.0374999046325684, "reward_std": 1.0091164112091064, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3968749940395355, "step": 4686 }, { "completion_length": 108.0625, "epoch": 2.5077581594435525, "grad_norm": 0.9218791127204895, "kl": 0.17527621984481812, "learning_rate": 3.990107949619637e-07, "loss": 0.007, "reward": 1.94350004196167, "reward_std": 0.5908198952674866, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45912498235702515, "step": 4687 }, { "completion_length": 109.75, "epoch": 2.5082932049224183, "grad_norm": 0.5619643330574036, "kl": 0.23050160706043243, "learning_rate": 3.981676333156659e-07, "loss": 0.0092, "reward": 2.5567500591278076, "reward_std": 0.7274917960166931, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49424999952316284, "step": 4688 }, { "completion_length": 144.625, "epoch": 2.508828250401284, "grad_norm": 0.6705049276351929, "kl": 0.14939343929290771, "learning_rate": 3.9732528637986905e-07, "loss": 0.006, "reward": 2.19712495803833, "reward_std": 0.6796766519546509, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47837501764297485, "step": 4689 }, { "completion_length": 130.21875, "epoch": 2.5093632958801497, "grad_norm": 0.7268813252449036, "kl": 0.17128820717334747, "learning_rate": 3.96483754481084e-07, "loss": 0.0069, "reward": 2.66378116607666, "reward_std": 0.9936395883560181, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4762812554836273, "step": 4690 }, { "completion_length": 136.125, "epoch": 2.5098983413590155, "grad_norm": 0.7191043496131897, "kl": 0.18291209638118744, "learning_rate": 3.9564303794550563e-07, "loss": 0.0073, "reward": 1.7421875, "reward_std": 0.9056715965270996, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 4691 }, { "completion_length": 150.03125, "epoch": 2.510433386837881, "grad_norm": 0.7785506248474121, "kl": 0.17820189893245697, "learning_rate": 3.9480313709900964e-07, "loss": 0.0071, "reward": 1.9588749408721924, "reward_std": 0.428705632686615, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45887500047683716, "step": 4692 }, { "completion_length": 131.3125, "epoch": 2.510968432316747, "grad_norm": 1.6756492853164673, "kl": 0.20322507619857788, "learning_rate": 3.939640522671592e-07, "loss": 0.0081, "reward": 2.091031312942505, "reward_std": 0.6650626063346863, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4504062533378601, "step": 4693 }, { "completion_length": 144.75, "epoch": 2.5115034777956127, "grad_norm": 0.7996964454650879, "kl": 0.15048281848430634, "learning_rate": 3.931257837751995e-07, "loss": 0.006, "reward": 2.130187511444092, "reward_std": 0.633294939994812, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4583125114440918, "step": 4694 }, { "completion_length": 142.71875, "epoch": 2.5120385232744784, "grad_norm": 1.711870551109314, "kl": 0.18146800994873047, "learning_rate": 3.922883319480586e-07, "loss": 0.0073, "reward": 1.6253437995910645, "reward_std": 0.7898539900779724, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4222187399864197, "step": 4695 }, { "completion_length": 137.09375, "epoch": 2.512573568753344, "grad_norm": 1.2345839738845825, "kl": 0.15395528078079224, "learning_rate": 3.914516971103499e-07, "loss": 0.0062, "reward": 1.6103436946868896, "reward_std": 0.6365225315093994, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4384687542915344, "step": 4696 }, { "completion_length": 136.84375, "epoch": 2.51310861423221, "grad_norm": 2.232506036758423, "kl": 0.2137535959482193, "learning_rate": 3.9061587958636874e-07, "loss": 0.0086, "reward": 1.7992500066757202, "reward_std": 0.7573329210281372, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3930000066757202, "step": 4697 }, { "completion_length": 122.625, "epoch": 2.513643659711075, "grad_norm": 1.1896374225616455, "kl": 0.18097208440303802, "learning_rate": 3.8978087970009406e-07, "loss": 0.0072, "reward": 2.089656352996826, "reward_std": 0.9578969478607178, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4646562337875366, "step": 4698 }, { "completion_length": 145.90625, "epoch": 2.5141787051899414, "grad_norm": 0.8311559557914734, "kl": 0.150002121925354, "learning_rate": 3.889466977751871e-07, "loss": 0.006, "reward": 1.8560625314712524, "reward_std": 0.6918398141860962, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48106250166893005, "step": 4699 }, { "completion_length": 118.5, "epoch": 2.5147137506688066, "grad_norm": 1.1358226537704468, "kl": 0.1593157798051834, "learning_rate": 3.881133341349941e-07, "loss": 0.0064, "reward": 2.7053749561309814, "reward_std": 0.9083874225616455, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4866250157356262, "step": 4700 }, { "completion_length": 132.3125, "epoch": 2.515248796147673, "grad_norm": 0.7428402900695801, "kl": 0.1499568372964859, "learning_rate": 3.872807891025418e-07, "loss": 0.006, "reward": 1.9558436870574951, "reward_std": 0.8919988870620728, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4714687466621399, "step": 4701 }, { "completion_length": 127.90625, "epoch": 2.515783841626538, "grad_norm": 0.8096319437026978, "kl": 0.18461036682128906, "learning_rate": 3.8644906300054047e-07, "loss": 0.0074, "reward": 2.1077499389648438, "reward_std": 0.7974432706832886, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4827499985694885, "step": 4702 }, { "completion_length": 149.9375, "epoch": 2.516318887105404, "grad_norm": 1.6174404621124268, "kl": 0.18541854619979858, "learning_rate": 3.8561815615138386e-07, "loss": 0.0074, "reward": 2.163562536239624, "reward_std": 1.2216967344284058, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44481250643730164, "step": 4703 }, { "completion_length": 138.40625, "epoch": 2.5168539325842696, "grad_norm": 6.07016658782959, "kl": 0.18740440905094147, "learning_rate": 3.8478806887714716e-07, "loss": 0.0075, "reward": 2.004593849182129, "reward_std": 1.001908779144287, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47334375977516174, "step": 4704 }, { "completion_length": 142.15625, "epoch": 2.5173889780631353, "grad_norm": 1.3939400911331177, "kl": 0.2532583773136139, "learning_rate": 3.839588014995874e-07, "loss": 0.0101, "reward": 2.301687479019165, "reward_std": 0.7982462644577026, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4735625088214874, "step": 4705 }, { "completion_length": 152.5, "epoch": 2.517924023542001, "grad_norm": 0.801834762096405, "kl": 0.15923190116882324, "learning_rate": 3.8313035434014543e-07, "loss": 0.0064, "reward": 1.5382499694824219, "reward_std": 0.4597247242927551, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.38200002908706665, "step": 4706 }, { "completion_length": 128.21875, "epoch": 2.518459069020867, "grad_norm": 1.0118614435195923, "kl": 0.1413116753101349, "learning_rate": 3.8230272771994314e-07, "loss": 0.0057, "reward": 2.3340938091278076, "reward_std": 0.8754700422286987, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47471874952316284, "step": 4707 }, { "completion_length": 152.625, "epoch": 2.5189941144997325, "grad_norm": 1.4573299884796143, "kl": 0.2363920956850052, "learning_rate": 3.8147592195978354e-07, "loss": 0.0095, "reward": 1.7870625257492065, "reward_std": 0.32108283042907715, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44331249594688416, "step": 4708 }, { "completion_length": 128.84375, "epoch": 2.5195291599785983, "grad_norm": 1.6262845993041992, "kl": 0.2181694209575653, "learning_rate": 3.806499373801537e-07, "loss": 0.0087, "reward": 2.1531875133514404, "reward_std": 0.6881246566772461, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48131251335144043, "step": 4709 }, { "completion_length": 138.40625, "epoch": 2.520064205457464, "grad_norm": 10.120092391967773, "kl": 0.5200239419937134, "learning_rate": 3.798247743012201e-07, "loss": 0.0208, "reward": 2.444312572479248, "reward_std": 0.8218584656715393, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44431251287460327, "step": 4710 }, { "completion_length": 129.46875, "epoch": 2.5205992509363297, "grad_norm": 0.8722732663154602, "kl": 0.23074685037136078, "learning_rate": 3.7900043304283237e-07, "loss": 0.0092, "reward": 2.1615936756134033, "reward_std": 1.2390285730361938, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4428437352180481, "step": 4711 }, { "completion_length": 119.625, "epoch": 2.5211342964151955, "grad_norm": 1.2176648378372192, "kl": 0.19088110327720642, "learning_rate": 3.7817691392452004e-07, "loss": 0.0076, "reward": 2.1262500286102295, "reward_std": 0.6973518133163452, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4856249988079071, "step": 4712 }, { "completion_length": 118.375, "epoch": 2.521669341894061, "grad_norm": 1.9435598850250244, "kl": 0.27165621519088745, "learning_rate": 3.773542172654962e-07, "loss": 0.0109, "reward": 2.6609063148498535, "reward_std": 1.0878492593765259, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47340625524520874, "step": 4713 }, { "completion_length": 120.125, "epoch": 2.522204387372927, "grad_norm": 1.045148253440857, "kl": 0.19055667519569397, "learning_rate": 3.7653234338465293e-07, "loss": 0.0076, "reward": 1.7790625095367432, "reward_std": 0.7295929789543152, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48218750953674316, "step": 4714 }, { "completion_length": 116.6875, "epoch": 2.5227394328517923, "grad_norm": 3.257887125015259, "kl": 0.3430156707763672, "learning_rate": 3.757112926005643e-07, "loss": 0.0137, "reward": 2.5621249675750732, "reward_std": 0.5449588298797607, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49962499737739563, "step": 4715 }, { "completion_length": 126.15625, "epoch": 2.523274478330658, "grad_norm": 1.0483906269073486, "kl": 0.15469497442245483, "learning_rate": 3.7489106523148663e-07, "loss": 0.0062, "reward": 2.1040310859680176, "reward_std": 0.8033292293548584, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4477812647819519, "step": 4716 }, { "completion_length": 141.96875, "epoch": 2.5238095238095237, "grad_norm": 1.0833120346069336, "kl": 0.18918181955814362, "learning_rate": 3.740716615953535e-07, "loss": 0.0076, "reward": 1.9797186851501465, "reward_std": 0.8459652662277222, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47971874475479126, "step": 4717 }, { "completion_length": 132.46875, "epoch": 2.5243445692883895, "grad_norm": 0.9802600741386414, "kl": 0.20945562422275543, "learning_rate": 3.732530820097832e-07, "loss": 0.0084, "reward": 2.216531276702881, "reward_std": 0.6936811208724976, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46653124690055847, "step": 4718 }, { "completion_length": 129.0625, "epoch": 2.524879614767255, "grad_norm": 0.6472675800323486, "kl": 0.20251840353012085, "learning_rate": 3.724353267920724e-07, "loss": 0.0081, "reward": 2.0553126335144043, "reward_std": 0.7865055203437805, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44593751430511475, "step": 4719 }, { "completion_length": 141.25, "epoch": 2.525414660246121, "grad_norm": 795344.4375, "kl": 1524.966064453125, "learning_rate": 3.716183962591982e-07, "loss": 60.9986, "reward": 1.493187427520752, "reward_std": 0.7424823045730591, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44631248712539673, "step": 4720 }, { "completion_length": 132.28125, "epoch": 2.5259497057249867, "grad_norm": 1.5334415435791016, "kl": 0.2697875499725342, "learning_rate": 3.708022907278194e-07, "loss": 0.0108, "reward": 2.3508124351501465, "reward_std": 0.8401082754135132, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47581249475479126, "step": 4721 }, { "completion_length": 120.53125, "epoch": 2.5264847512038524, "grad_norm": 0.6760824918746948, "kl": 0.16567105054855347, "learning_rate": 3.699870105142736e-07, "loss": 0.0066, "reward": 2.0044374465942383, "reward_std": 0.298616886138916, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48881250619888306, "step": 4722 }, { "completion_length": 126.09375, "epoch": 2.527019796682718, "grad_norm": 3.0384511947631836, "kl": 0.2962270975112915, "learning_rate": 3.691725559345788e-07, "loss": 0.0118, "reward": 1.8259687423706055, "reward_std": 0.725497841835022, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43534374237060547, "step": 4723 }, { "completion_length": 129.28125, "epoch": 2.527554842161584, "grad_norm": 2.9928784370422363, "kl": 0.2558183968067169, "learning_rate": 3.6835892730443415e-07, "loss": 0.0102, "reward": 1.5166873931884766, "reward_std": 0.6120395660400391, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4698125123977661, "step": 4724 }, { "completion_length": 129.59375, "epoch": 2.5280898876404496, "grad_norm": 1.2893781661987305, "kl": 0.15502503514289856, "learning_rate": 3.6754612493921676e-07, "loss": 0.0062, "reward": 1.4645311832427979, "reward_std": 0.6372846364974976, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4645312428474426, "step": 4725 }, { "completion_length": 146.125, "epoch": 2.528624933119315, "grad_norm": 1.397131085395813, "kl": 0.3134192228317261, "learning_rate": 3.66734149153985e-07, "loss": 0.0125, "reward": 1.7288124561309814, "reward_std": 0.4786561131477356, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4788125157356262, "step": 4726 }, { "completion_length": 120.6875, "epoch": 2.529159978598181, "grad_norm": 0.9648488163948059, "kl": 0.2845837473869324, "learning_rate": 3.6592300026347533e-07, "loss": 0.0114, "reward": 2.2877187728881836, "reward_std": 0.8149144649505615, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4752187430858612, "step": 4727 }, { "completion_length": 111.75, "epoch": 2.5296950240770464, "grad_norm": 0.4563295841217041, "kl": 0.20992808043956757, "learning_rate": 3.6511267858210576e-07, "loss": 0.0084, "reward": 3.078125, "reward_std": 0.48759162425994873, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 4728 }, { "completion_length": 117.78125, "epoch": 2.530230069555912, "grad_norm": 0.2649252116680145, "kl": 0.19923090934753418, "learning_rate": 3.6430318442397224e-07, "loss": 0.008, "reward": 1.4370625019073486, "reward_std": 0.06729277223348618, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49956250190734863, "step": 4729 }, { "completion_length": 109.40625, "epoch": 2.530765115034778, "grad_norm": 1.0059431791305542, "kl": 0.2096429169178009, "learning_rate": 3.6349451810284903e-07, "loss": 0.0084, "reward": 1.8084688186645508, "reward_std": 0.5808157324790955, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.495968759059906, "step": 4730 }, { "completion_length": 129.4375, "epoch": 2.5313001605136436, "grad_norm": 0.5156025290489197, "kl": 0.18635743856430054, "learning_rate": 3.626866799321929e-07, "loss": 0.0075, "reward": 1.9718124866485596, "reward_std": 0.5187233090400696, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47181248664855957, "step": 4731 }, { "completion_length": 137.125, "epoch": 2.5318352059925093, "grad_norm": 0.641145646572113, "kl": 0.18280138075351715, "learning_rate": 3.618796702251351e-07, "loss": 0.0073, "reward": 2.521031379699707, "reward_std": 0.7082536816596985, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4585312604904175, "step": 4732 }, { "completion_length": 127.125, "epoch": 2.532370251471375, "grad_norm": 5.005784511566162, "kl": 0.4387560784816742, "learning_rate": 3.610734892944892e-07, "loss": 0.0176, "reward": 2.11928129196167, "reward_std": 0.6884886026382446, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47865626215934753, "step": 4733 }, { "completion_length": 143.75, "epoch": 2.532905296950241, "grad_norm": 1.2154611349105835, "kl": 0.2060457468032837, "learning_rate": 3.6026813745274726e-07, "loss": 0.0082, "reward": 2.33203125, "reward_std": 0.7423326969146729, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 4734 }, { "completion_length": 128.3125, "epoch": 2.5334403424291065, "grad_norm": 1.4964922666549683, "kl": 0.21241888403892517, "learning_rate": 3.5946361501207673e-07, "loss": 0.0085, "reward": 2.022312641143799, "reward_std": 0.7025336027145386, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4598124921321869, "step": 4735 }, { "completion_length": 126.28125, "epoch": 2.5339753879079723, "grad_norm": 0.7992390990257263, "kl": 0.2003585696220398, "learning_rate": 3.5865992228432815e-07, "loss": 0.008, "reward": 1.761812448501587, "reward_std": 0.8612908124923706, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4649375081062317, "step": 4736 }, { "completion_length": 126.09375, "epoch": 2.534510433386838, "grad_norm": 0.704579770565033, "kl": 0.1753414273262024, "learning_rate": 3.578570595810274e-07, "loss": 0.007, "reward": 1.7429062128067017, "reward_std": 0.4869588017463684, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46165624260902405, "step": 4737 }, { "completion_length": 139.9375, "epoch": 2.5350454788657037, "grad_norm": 15.965436935424805, "kl": 0.22250904142856598, "learning_rate": 3.5705502721337883e-07, "loss": 0.0089, "reward": 1.831781268119812, "reward_std": 0.9639146327972412, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4567812383174896, "step": 4738 }, { "completion_length": 138.25, "epoch": 2.535580524344569, "grad_norm": 0.40621525049209595, "kl": 0.142560675740242, "learning_rate": 3.562538254922665e-07, "loss": 0.0057, "reward": 2.4125938415527344, "reward_std": 0.6967193484306335, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4594687521457672, "step": 4739 }, { "completion_length": 127.96875, "epoch": 2.5361155698234352, "grad_norm": 1.7441964149475098, "kl": 0.2058154195547104, "learning_rate": 3.554534547282512e-07, "loss": 0.0082, "reward": 2.1796875, "reward_std": 0.9842422008514404, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 4740 }, { "completion_length": 145.53125, "epoch": 2.5366506153023005, "grad_norm": 1.4001917839050293, "kl": 0.15187200903892517, "learning_rate": 3.5465391523157174e-07, "loss": 0.0061, "reward": 2.734375, "reward_std": 0.711200475692749, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 4741 }, { "completion_length": 149.09375, "epoch": 2.5371856607811663, "grad_norm": 1.1292855739593506, "kl": 0.13685834407806396, "learning_rate": 3.538552073121454e-07, "loss": 0.0055, "reward": 1.7412188053131104, "reward_std": 0.8096403479576111, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4287187457084656, "step": 4742 }, { "completion_length": 128.40625, "epoch": 2.537720706260032, "grad_norm": 58.29304122924805, "kl": 1.6430479288101196, "learning_rate": 3.530573312795665e-07, "loss": 0.0657, "reward": 2.005000114440918, "reward_std": 0.8903298377990723, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4424999952316284, "step": 4743 }, { "completion_length": 152.21875, "epoch": 2.5382557517388977, "grad_norm": 0.5972285866737366, "kl": 0.1408812701702118, "learning_rate": 3.522602874431072e-07, "loss": 0.0056, "reward": 1.531937599182129, "reward_std": 0.6665350198745728, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.42256250977516174, "step": 4744 }, { "completion_length": 131.875, "epoch": 2.5387907972177635, "grad_norm": 0.7604286670684814, "kl": 0.15751340985298157, "learning_rate": 3.514640761117161e-07, "loss": 0.0063, "reward": 2.3198437690734863, "reward_std": 0.33620136976242065, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49171876907348633, "step": 4745 }, { "completion_length": 122.46875, "epoch": 2.539325842696629, "grad_norm": 1.8115438222885132, "kl": 0.3308871388435364, "learning_rate": 3.506686975940213e-07, "loss": 0.0132, "reward": 2.1657187938690186, "reward_std": 0.9255882501602173, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4938437342643738, "step": 4746 }, { "completion_length": 117.4375, "epoch": 2.539860888175495, "grad_norm": 0.907682478427887, "kl": 0.16101999580860138, "learning_rate": 3.4987415219832596e-07, "loss": 0.0064, "reward": 2.075937509536743, "reward_std": 0.8526684641838074, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48218750953674316, "step": 4747 }, { "completion_length": 134.3125, "epoch": 2.5403959336543607, "grad_norm": 0.8527546525001526, "kl": 0.19805528223514557, "learning_rate": 3.490804402326106e-07, "loss": 0.0079, "reward": 1.4482812881469727, "reward_std": 0.6872881650924683, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43265628814697266, "step": 4748 }, { "completion_length": 111.65625, "epoch": 2.5409309791332264, "grad_norm": 1.94907546043396, "kl": 0.20406663417816162, "learning_rate": 3.482875620045348e-07, "loss": 0.0082, "reward": 2.327812433242798, "reward_std": 0.832241415977478, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4996874928474426, "step": 4749 }, { "completion_length": 140.28125, "epoch": 2.541466024612092, "grad_norm": 1.584962010383606, "kl": 0.1461794376373291, "learning_rate": 3.4749551782143206e-07, "loss": 0.0058, "reward": 1.373687505722046, "reward_std": 0.46948572993278503, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4674375057220459, "step": 4750 }, { "completion_length": 126.1875, "epoch": 2.542001070090958, "grad_norm": 26.451478958129883, "kl": 0.9645400643348694, "learning_rate": 3.4670430799031403e-07, "loss": 0.0386, "reward": 1.7102500200271606, "reward_std": 0.7537509202957153, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47587502002716064, "step": 4751 }, { "completion_length": 119.96875, "epoch": 2.542536115569823, "grad_norm": 0.726107120513916, "kl": 0.17563873529434204, "learning_rate": 3.4591393281786926e-07, "loss": 0.007, "reward": 2.444499969482422, "reward_std": 0.8688093423843384, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44449999928474426, "step": 4752 }, { "completion_length": 110.0, "epoch": 2.5430711610486894, "grad_norm": 1.3604161739349365, "kl": 0.22580264508724213, "learning_rate": 3.4512439261046195e-07, "loss": 0.009, "reward": 2.118406295776367, "reward_std": 0.9022918939590454, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4621562361717224, "step": 4753 }, { "completion_length": 131.34375, "epoch": 2.5436062065275546, "grad_norm": 1.635473370552063, "kl": 0.18044263124465942, "learning_rate": 3.443356876741333e-07, "loss": 0.0072, "reward": 2.1336874961853027, "reward_std": 0.5037642121315002, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.39931249618530273, "step": 4754 }, { "completion_length": 135.875, "epoch": 2.5441412520064204, "grad_norm": 1.9027355909347534, "kl": 0.20919106900691986, "learning_rate": 3.435478183145999e-07, "loss": 0.0084, "reward": 1.6132187843322754, "reward_std": 0.9687417149543762, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4569687247276306, "step": 4755 }, { "completion_length": 139.53125, "epoch": 2.544676297485286, "grad_norm": 0.440214604139328, "kl": 0.16257914900779724, "learning_rate": 3.427607848372555e-07, "loss": 0.0065, "reward": 2.5606250762939453, "reward_std": 0.6071674823760986, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45125001668930054, "step": 4756 }, { "completion_length": 123.09375, "epoch": 2.545211342964152, "grad_norm": 0.976713240146637, "kl": 0.31779465079307556, "learning_rate": 3.4197458754716947e-07, "loss": 0.0127, "reward": 2.0776875019073486, "reward_std": 0.7497450113296509, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46831250190734863, "step": 4757 }, { "completion_length": 99.75, "epoch": 2.5457463884430176, "grad_norm": 0.3346754312515259, "kl": 0.16422037780284882, "learning_rate": 3.411892267490863e-07, "loss": 0.0066, "reward": 3.0, "reward_std": 0.24945424497127533, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 4758 }, { "completion_length": 121.78125, "epoch": 2.5462814339218833, "grad_norm": 0.7401842474937439, "kl": 0.1868087500333786, "learning_rate": 3.404047027474283e-07, "loss": 0.0075, "reward": 2.613187313079834, "reward_std": 0.724839985370636, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4725624918937683, "step": 4759 }, { "completion_length": 149.59375, "epoch": 2.546816479400749, "grad_norm": 1.257609248161316, "kl": 0.17161527276039124, "learning_rate": 3.3962101584628983e-07, "loss": 0.0069, "reward": 1.828125, "reward_std": 0.8157760500907898, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.453125, "step": 4760 }, { "completion_length": 111.875, "epoch": 2.547351524879615, "grad_norm": 1.8318310976028442, "kl": 0.2008979320526123, "learning_rate": 3.388381663494442e-07, "loss": 0.008, "reward": 2.7492499351501465, "reward_std": 0.5700373649597168, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49925002455711365, "step": 4761 }, { "completion_length": 136.9375, "epoch": 2.5478865703584805, "grad_norm": 0.7774658203125, "kl": 0.1467021256685257, "learning_rate": 3.3805615456033974e-07, "loss": 0.0059, "reward": 2.360499858856201, "reward_std": 0.7778060436248779, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4073749780654907, "step": 4762 }, { "completion_length": 133.75, "epoch": 2.5484216158373463, "grad_norm": 0.9425368905067444, "kl": 0.19740375876426697, "learning_rate": 3.372749807820974e-07, "loss": 0.0079, "reward": 2.069093704223633, "reward_std": 0.4945266842842102, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4440937638282776, "step": 4763 }, { "completion_length": 124.46875, "epoch": 2.548956661316212, "grad_norm": 2.9941658973693848, "kl": 0.19782444834709167, "learning_rate": 3.3649464531751637e-07, "loss": 0.0079, "reward": 2.21484375, "reward_std": 0.4977508783340454, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46484375, "step": 4764 }, { "completion_length": 146.84375, "epoch": 2.5494917067950773, "grad_norm": 1.4438210725784302, "kl": 0.18529817461967468, "learning_rate": 3.3571514846906934e-07, "loss": 0.0074, "reward": 1.7878749370574951, "reward_std": 1.256537675857544, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.3659999966621399, "step": 4765 }, { "completion_length": 137.96875, "epoch": 2.5500267522739435, "grad_norm": 0.7532020807266235, "kl": 0.291342556476593, "learning_rate": 3.3493649053890325e-07, "loss": 0.0117, "reward": 1.9950624704360962, "reward_std": 1.1991889476776123, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4638125002384186, "step": 4766 }, { "completion_length": 102.875, "epoch": 2.550561797752809, "grad_norm": 1.0394216775894165, "kl": 0.2988191843032837, "learning_rate": 3.3415867182884256e-07, "loss": 0.012, "reward": 2.0922188758850098, "reward_std": 0.8908177614212036, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4672187566757202, "step": 4767 }, { "completion_length": 98.09375, "epoch": 2.551096843231675, "grad_norm": 1.633399248123169, "kl": 0.19735831022262573, "learning_rate": 3.333816926403838e-07, "loss": 0.0079, "reward": 3.160656213760376, "reward_std": 0.6276801228523254, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48878124356269836, "step": 4768 }, { "completion_length": 121.34375, "epoch": 2.5516318887105403, "grad_norm": 1.2068012952804565, "kl": 0.407413125038147, "learning_rate": 3.326055532746991e-07, "loss": 0.0163, "reward": 2.405437469482422, "reward_std": 1.179656744003296, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45231249928474426, "step": 4769 }, { "completion_length": 123.21875, "epoch": 2.552166934189406, "grad_norm": 0.580380916595459, "kl": 0.1739131659269333, "learning_rate": 3.318302540326343e-07, "loss": 0.007, "reward": 1.9589688777923584, "reward_std": 0.8745176792144775, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44334375858306885, "step": 4770 }, { "completion_length": 132.59375, "epoch": 2.5527019796682717, "grad_norm": 0.9321470856666565, "kl": 0.2096346765756607, "learning_rate": 3.310557952147114e-07, "loss": 0.0084, "reward": 2.4759373664855957, "reward_std": 0.902610719203949, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49156248569488525, "step": 4771 }, { "completion_length": 131.6875, "epoch": 2.5532370251471375, "grad_norm": 1.6686019897460938, "kl": 0.22613924741744995, "learning_rate": 3.3028217712112484e-07, "loss": 0.009, "reward": 1.924625039100647, "reward_std": 0.9805184006690979, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4090000092983246, "step": 4772 }, { "completion_length": 125.25, "epoch": 2.553772070626003, "grad_norm": 10.02044677734375, "kl": 0.38795486092567444, "learning_rate": 3.295094000517432e-07, "loss": 0.0155, "reward": 2.240906238555908, "reward_std": 1.111351490020752, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4752812385559082, "step": 4773 }, { "completion_length": 140.78125, "epoch": 2.554307116104869, "grad_norm": 1.4516719579696655, "kl": 0.22141635417938232, "learning_rate": 3.287374643061111e-07, "loss": 0.0089, "reward": 1.8186249732971191, "reward_std": 0.6072468161582947, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.39675000309944153, "step": 4774 }, { "completion_length": 121.15625, "epoch": 2.5548421615837347, "grad_norm": 0.38105911016464233, "kl": 0.1452609896659851, "learning_rate": 3.2796637018344484e-07, "loss": 0.0058, "reward": 1.7541874647140503, "reward_std": 0.5216787457466125, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4885624945163727, "step": 4775 }, { "completion_length": 135.9375, "epoch": 2.5553772070626004, "grad_norm": 1.2555724382400513, "kl": 0.17949530482292175, "learning_rate": 3.2719611798263494e-07, "loss": 0.0072, "reward": 1.7030000686645508, "reward_std": 0.5004181861877441, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 4776 }, { "completion_length": 121.90625, "epoch": 2.555912252541466, "grad_norm": 0.7563667297363281, "kl": 0.18276560306549072, "learning_rate": 3.2642670800224735e-07, "loss": 0.0073, "reward": 2.765500068664551, "reward_std": 0.6515883207321167, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.484250009059906, "step": 4777 }, { "completion_length": 141.375, "epoch": 2.556447298020332, "grad_norm": 1.3842765092849731, "kl": 0.23749035596847534, "learning_rate": 3.256581405405179e-07, "loss": 0.0095, "reward": 1.5641250610351562, "reward_std": 0.5475702285766602, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4235000014305115, "step": 4778 }, { "completion_length": 163.375, "epoch": 2.5569823434991976, "grad_norm": 0.625377893447876, "kl": 0.1273825615644455, "learning_rate": 3.248904158953595e-07, "loss": 0.0051, "reward": 1.8578437566757202, "reward_std": 1.0639851093292236, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4047187566757202, "step": 4779 }, { "completion_length": 109.25, "epoch": 2.557517388978063, "grad_norm": 0.6894150376319885, "kl": 0.18845313787460327, "learning_rate": 3.2412353436435755e-07, "loss": 0.0075, "reward": 2.718437433242798, "reward_std": 0.6103979349136353, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4996874928474426, "step": 4780 }, { "completion_length": 131.875, "epoch": 2.558052434456929, "grad_norm": 1.4564372301101685, "kl": 0.16357383131980896, "learning_rate": 3.2335749624476804e-07, "loss": 0.0065, "reward": 2.50362491607666, "reward_std": 0.8026251792907715, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4880000054836273, "step": 4781 }, { "completion_length": 167.15625, "epoch": 2.5585874799357944, "grad_norm": 0.6597561836242676, "kl": 0.13866648077964783, "learning_rate": 3.2259230183352363e-07, "loss": 0.0055, "reward": 1.207437515258789, "reward_std": 0.8565574884414673, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41056251525878906, "step": 4782 }, { "completion_length": 128.59375, "epoch": 2.55912252541466, "grad_norm": 0.4228416681289673, "kl": 0.1491088718175888, "learning_rate": 3.2182795142722796e-07, "loss": 0.006, "reward": 2.3301875591278076, "reward_std": 0.31569403409957886, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47081249952316284, "step": 4783 }, { "completion_length": 133.96875, "epoch": 2.559657570893526, "grad_norm": 1.5369794368743896, "kl": 0.21559906005859375, "learning_rate": 3.210644453221573e-07, "loss": 0.0086, "reward": 2.2879061698913574, "reward_std": 1.093303918838501, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4597812592983246, "step": 4784 }, { "completion_length": 158.1875, "epoch": 2.5601926163723916, "grad_norm": 1.830809473991394, "kl": 0.18945065140724182, "learning_rate": 3.203017838142622e-07, "loss": 0.0076, "reward": 1.404593825340271, "reward_std": 0.8285512924194336, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.40459373593330383, "step": 4785 }, { "completion_length": 126.96875, "epoch": 2.5607276618512573, "grad_norm": 2.262535333633423, "kl": 0.20801687240600586, "learning_rate": 3.1953996719916445e-07, "loss": 0.0083, "reward": 2.2134687900543213, "reward_std": 0.8274169564247131, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4947187602519989, "step": 4786 }, { "completion_length": 109.53125, "epoch": 2.561262707330123, "grad_norm": 1.0505934953689575, "kl": 0.26221561431884766, "learning_rate": 3.1877899577215874e-07, "loss": 0.0105, "reward": 2.548156261444092, "reward_std": 0.506241500377655, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4700312614440918, "step": 4787 }, { "completion_length": 137.78125, "epoch": 2.561797752808989, "grad_norm": 0.7290158867835999, "kl": 0.16419222950935364, "learning_rate": 3.180188698282116e-07, "loss": 0.0066, "reward": 1.4023125171661377, "reward_std": 0.47511011362075806, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4648125171661377, "step": 4788 }, { "completion_length": 122.125, "epoch": 2.5623327982878545, "grad_norm": 1.2783591747283936, "kl": 0.20638862252235413, "learning_rate": 3.1725958966196346e-07, "loss": 0.0083, "reward": 2.397531270980835, "reward_std": 0.734356164932251, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4756562411785126, "step": 4789 }, { "completion_length": 146.0, "epoch": 2.5628678437667203, "grad_norm": 0.8221173882484436, "kl": 0.17135313153266907, "learning_rate": 3.1650115556772554e-07, "loss": 0.0069, "reward": 1.6217501163482666, "reward_std": 1.111527919769287, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.38737499713897705, "step": 4790 }, { "completion_length": 112.46875, "epoch": 2.563402889245586, "grad_norm": 0.8586945533752441, "kl": 0.20827066898345947, "learning_rate": 3.1574356783948045e-07, "loss": 0.0083, "reward": 2.7546563148498535, "reward_std": 0.6137493252754211, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47340625524520874, "step": 4791 }, { "completion_length": 126.5625, "epoch": 2.5639379347244518, "grad_norm": 0.5147752165794373, "kl": 0.16962100565433502, "learning_rate": 3.149868267708853e-07, "loss": 0.0068, "reward": 2.596843719482422, "reward_std": 0.7783972024917603, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48746874928474426, "step": 4792 }, { "completion_length": 120.5, "epoch": 2.564472980203317, "grad_norm": 0.8812384009361267, "kl": 0.25963398814201355, "learning_rate": 3.1423093265526646e-07, "loss": 0.0104, "reward": 2.038249969482422, "reward_std": 0.5581985712051392, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.44450002908706665, "step": 4793 }, { "completion_length": 158.78125, "epoch": 2.5650080256821832, "grad_norm": 1.615173101425171, "kl": 0.27248892188072205, "learning_rate": 3.1347588578562293e-07, "loss": 0.0109, "reward": 1.7335624694824219, "reward_std": 0.9775846600532532, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42106252908706665, "step": 4794 }, { "completion_length": 96.28125, "epoch": 2.5655430711610485, "grad_norm": 1.75906240940094, "kl": 0.3186013698577881, "learning_rate": 3.127216864546259e-07, "loss": 0.0127, "reward": 2.6947813034057617, "reward_std": 0.7644572257995605, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.49165624380111694, "step": 4795 }, { "completion_length": 133.6875, "epoch": 2.5660781166399143, "grad_norm": 0.9870030283927917, "kl": 0.23916082084178925, "learning_rate": 3.119683349546171e-07, "loss": 0.0096, "reward": 2.232656240463257, "reward_std": 0.6514589190483093, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48265624046325684, "step": 4796 }, { "completion_length": 137.875, "epoch": 2.56661316211878, "grad_norm": 3.204596757888794, "kl": 0.1706799864768982, "learning_rate": 3.1121583157761037e-07, "loss": 0.0068, "reward": 1.8103125095367432, "reward_std": 0.7336111068725586, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43531250953674316, "step": 4797 }, { "completion_length": 133.65625, "epoch": 2.5671482075976457, "grad_norm": 1.129596471786499, "kl": 0.19066333770751953, "learning_rate": 3.104641766152894e-07, "loss": 0.0076, "reward": 1.5910937786102295, "reward_std": 0.6181017160415649, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4504687786102295, "step": 4798 }, { "completion_length": 126.0625, "epoch": 2.5676832530765115, "grad_norm": 1.6138474941253662, "kl": 0.22592277824878693, "learning_rate": 3.0971337035901135e-07, "loss": 0.009, "reward": 1.8624687194824219, "reward_std": 0.7406941056251526, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48746874928474426, "step": 4799 }, { "completion_length": 118.375, "epoch": 2.568218298555377, "grad_norm": 2.0723257064819336, "kl": 0.28338536620140076, "learning_rate": 3.089634130998026e-07, "loss": 0.0113, "reward": 1.3395624160766602, "reward_std": 0.4795537292957306, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4489375352859497, "step": 4800 }, { "completion_length": 131.96875, "epoch": 2.568753344034243, "grad_norm": 9.10276985168457, "kl": 1.094050645828247, "learning_rate": 3.0821430512836036e-07, "loss": 0.0438, "reward": 2.151750087738037, "reward_std": 0.5889893770217896, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43299996852874756, "step": 4801 }, { "completion_length": 149.09375, "epoch": 2.5692883895131087, "grad_norm": 1.2040045261383057, "kl": 0.16446256637573242, "learning_rate": 3.0746604673505486e-07, "loss": 0.0066, "reward": 1.6226249933242798, "reward_std": 1.08882474899292, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4194999933242798, "step": 4802 }, { "completion_length": 125.28125, "epoch": 2.5698234349919744, "grad_norm": 1.5504121780395508, "kl": 0.15673749148845673, "learning_rate": 3.0671863820992326e-07, "loss": 0.0063, "reward": 1.7109375, "reward_std": 0.7658188939094543, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 4803 }, { "completion_length": 170.21875, "epoch": 2.57035848047084, "grad_norm": 1.2650046348571777, "kl": 0.1724824607372284, "learning_rate": 3.059720798426763e-07, "loss": 0.0069, "reward": 1.359375, "reward_std": 0.7814817428588867, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.421875, "step": 4804 }, { "completion_length": 151.875, "epoch": 2.570893525949706, "grad_norm": 0.8518446087837219, "kl": 0.16078509390354156, "learning_rate": 3.052263719226958e-07, "loss": 0.0064, "reward": 2.0029373168945312, "reward_std": 0.786677360534668, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4560624957084656, "step": 4805 }, { "completion_length": 123.46875, "epoch": 2.571428571428571, "grad_norm": 2096.764404296875, "kl": 1.6489992141723633, "learning_rate": 3.044815147390301e-07, "loss": 0.066, "reward": 1.631250023841858, "reward_std": 0.567301869392395, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4906250238418579, "step": 4806 }, { "completion_length": 142.90625, "epoch": 2.5719636169074374, "grad_norm": 0.6382502317428589, "kl": 0.19452229142189026, "learning_rate": 3.0373750858040175e-07, "loss": 0.0078, "reward": 1.6422500610351562, "reward_std": 0.6030337810516357, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4703750014305115, "step": 4807 }, { "completion_length": 156.1875, "epoch": 2.5724986623863026, "grad_norm": 0.8569088578224182, "kl": 0.13920657336711884, "learning_rate": 3.0299435373520094e-07, "loss": 0.0056, "reward": 1.8084688186645508, "reward_std": 0.6464462280273438, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.417843759059906, "step": 4808 }, { "completion_length": 112.1875, "epoch": 2.5730337078651684, "grad_norm": 1.775011420249939, "kl": 0.23424896597862244, "learning_rate": 3.022520504914886e-07, "loss": 0.0094, "reward": 2.26953125, "reward_std": 0.5297657251358032, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 4809 }, { "completion_length": 123.4375, "epoch": 2.573568753344034, "grad_norm": 0.7680403590202332, "kl": 0.2018153965473175, "learning_rate": 3.0151059913699684e-07, "loss": 0.0081, "reward": 2.213124990463257, "reward_std": 0.9320857524871826, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47874999046325684, "step": 4810 }, { "completion_length": 140.625, "epoch": 2.5741037988229, "grad_norm": 0.878387451171875, "kl": 0.16558325290679932, "learning_rate": 3.0076999995912543e-07, "loss": 0.0066, "reward": 2.187812566757202, "reward_std": 0.9394581317901611, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4534375071525574, "step": 4811 }, { "completion_length": 106.09375, "epoch": 2.5746388443017656, "grad_norm": 0.6212547421455383, "kl": 0.15675532817840576, "learning_rate": 3.000302532449445e-07, "loss": 0.0063, "reward": 1.8787813186645508, "reward_std": 0.2326778769493103, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 4812 }, { "completion_length": 142.65625, "epoch": 2.5751738897806313, "grad_norm": 1.1464401483535767, "kl": 0.15757587552070618, "learning_rate": 2.9929135928119496e-07, "loss": 0.0063, "reward": 2.140625, "reward_std": 1.2500951290130615, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.453125, "step": 4813 }, { "completion_length": 127.21875, "epoch": 2.575708935259497, "grad_norm": 0.892894446849823, "kl": 0.20677775144577026, "learning_rate": 2.9855331835428604e-07, "loss": 0.0083, "reward": 2.260125160217285, "reward_std": 0.7034406661987305, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4632500112056732, "step": 4814 }, { "completion_length": 136.375, "epoch": 2.576243980738363, "grad_norm": 3.0752813816070557, "kl": 0.4163013994693756, "learning_rate": 2.978161307502964e-07, "loss": 0.0167, "reward": 2.118781089782715, "reward_std": 0.7566046714782715, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46253126859664917, "step": 4815 }, { "completion_length": 136.75, "epoch": 2.5767790262172285, "grad_norm": 1.8159879446029663, "kl": 0.24284884333610535, "learning_rate": 2.9707979675497377e-07, "loss": 0.0097, "reward": 2.23828125, "reward_std": 0.7182356119155884, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 4816 }, { "completion_length": 124.25, "epoch": 2.5773140716960943, "grad_norm": 1.2621229887008667, "kl": 0.18814769387245178, "learning_rate": 2.963443166537358e-07, "loss": 0.0075, "reward": 2.0190625190734863, "reward_std": 0.7536734342575073, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45656248927116394, "step": 4817 }, { "completion_length": 142.4375, "epoch": 2.57784911717496, "grad_norm": 0.6149274706840515, "kl": 0.2243792712688446, "learning_rate": 2.9560969073166857e-07, "loss": 0.009, "reward": 1.9780625104904175, "reward_std": 0.6303946375846863, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4311875104904175, "step": 4818 }, { "completion_length": 137.125, "epoch": 2.5783841626538253, "grad_norm": 2.71390700340271, "kl": 0.21418774127960205, "learning_rate": 2.9487591927352676e-07, "loss": 0.0086, "reward": 1.7174062728881836, "reward_std": 0.8227350115776062, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4205312430858612, "step": 4819 }, { "completion_length": 134.5, "epoch": 2.5789192081326915, "grad_norm": 2.9344117641448975, "kl": 0.16873878240585327, "learning_rate": 2.9414300256373483e-07, "loss": 0.0067, "reward": 2.498781204223633, "reward_std": 1.058793306350708, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4675312638282776, "step": 4820 }, { "completion_length": 139.1875, "epoch": 2.579454253611557, "grad_norm": 0.422776997089386, "kl": 0.17820331454277039, "learning_rate": 2.9341094088638525e-07, "loss": 0.0071, "reward": 2.140812397003174, "reward_std": 0.6673548221588135, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.453312486410141, "step": 4821 }, { "completion_length": 139.6875, "epoch": 2.579989299090423, "grad_norm": 1.7020834684371948, "kl": 0.19916540384292603, "learning_rate": 2.9267973452523855e-07, "loss": 0.008, "reward": 1.2330937385559082, "reward_std": 0.672011137008667, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4049687385559082, "step": 4822 }, { "completion_length": 148.625, "epoch": 2.5805243445692883, "grad_norm": 0.7486924529075623, "kl": 0.16069647669792175, "learning_rate": 2.9194938376372525e-07, "loss": 0.0064, "reward": 1.6586874723434448, "reward_std": 0.888850212097168, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4243125021457672, "step": 4823 }, { "completion_length": 110.5625, "epoch": 2.581059390048154, "grad_norm": 0.8022189736366272, "kl": 0.299625039100647, "learning_rate": 2.9121988888494297e-07, "loss": 0.012, "reward": 2.330124855041504, "reward_std": 0.8151822090148926, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48637500405311584, "step": 4824 }, { "completion_length": 136.0, "epoch": 2.5815944355270197, "grad_norm": 1.2133455276489258, "kl": 0.2260652780532837, "learning_rate": 2.904912501716575e-07, "loss": 0.009, "reward": 1.9696249961853027, "reward_std": 1.1293843984603882, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46962499618530273, "step": 4825 }, { "completion_length": 137.09375, "epoch": 2.5821294810058855, "grad_norm": 2.4284088611602783, "kl": 0.1440749168395996, "learning_rate": 2.89763467906303e-07, "loss": 0.0058, "reward": 1.930999994277954, "reward_std": 0.6934002637863159, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4622499942779541, "step": 4826 }, { "completion_length": 138.84375, "epoch": 2.582664526484751, "grad_norm": 1.242531657218933, "kl": 0.18013262748718262, "learning_rate": 2.890365423709826e-07, "loss": 0.0072, "reward": 2.199718713760376, "reward_std": 0.9752111434936523, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43409374356269836, "step": 4827 }, { "completion_length": 138.34375, "epoch": 2.583199571963617, "grad_norm": 1.1508171558380127, "kl": 0.17873208224773407, "learning_rate": 2.883104738474665e-07, "loss": 0.0071, "reward": 2.263000011444092, "reward_std": 0.9996182918548584, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4505000114440918, "step": 4828 }, { "completion_length": 127.4375, "epoch": 2.5837346174424827, "grad_norm": 1.0300050973892212, "kl": 0.17204977571964264, "learning_rate": 2.8758526261719184e-07, "loss": 0.0069, "reward": 1.5091562271118164, "reward_std": 0.5034257769584656, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4622812569141388, "step": 4829 }, { "completion_length": 109.5625, "epoch": 2.5842696629213484, "grad_norm": 14.22237777709961, "kl": 1.1911489963531494, "learning_rate": 2.868609089612659e-07, "loss": 0.0476, "reward": 1.8660937547683716, "reward_std": 0.7801907062530518, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4598437547683716, "step": 4830 }, { "completion_length": 146.71875, "epoch": 2.584804708400214, "grad_norm": 2.1268234252929688, "kl": 0.49509522318840027, "learning_rate": 2.861374131604605e-07, "loss": 0.0198, "reward": 1.8864061832427979, "reward_std": 1.0176293849945068, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4332812428474426, "step": 4831 }, { "completion_length": 137.09375, "epoch": 2.58533975387908, "grad_norm": 1.684177041053772, "kl": 0.21646398305892944, "learning_rate": 2.8541477549521746e-07, "loss": 0.0087, "reward": 1.9279375076293945, "reward_std": 0.8553544282913208, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42793750762939453, "step": 4832 }, { "completion_length": 144.53125, "epoch": 2.5858747993579456, "grad_norm": 1.9791252613067627, "kl": 0.17304351925849915, "learning_rate": 2.846929962456446e-07, "loss": 0.0069, "reward": 1.8077812194824219, "reward_std": 0.9493666887283325, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44840624928474426, "step": 4833 }, { "completion_length": 123.0, "epoch": 2.586409844836811, "grad_norm": 1.3346514701843262, "kl": 0.19425471127033234, "learning_rate": 2.839720756915171e-07, "loss": 0.0078, "reward": 2.0329999923706055, "reward_std": 0.7621484994888306, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48612499237060547, "step": 4834 }, { "completion_length": 122.96875, "epoch": 2.586944890315677, "grad_norm": 2.0845377445220947, "kl": 0.22498102486133575, "learning_rate": 2.8325201411227824e-07, "loss": 0.009, "reward": 1.854968786239624, "reward_std": 0.943342924118042, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46434375643730164, "step": 4835 }, { "completion_length": 131.28125, "epoch": 2.5874799357945424, "grad_norm": 6.88124942779541, "kl": 0.31013867259025574, "learning_rate": 2.8253281178703767e-07, "loss": 0.0124, "reward": 3.0625, "reward_std": 0.8437830209732056, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 4836 }, { "completion_length": 129.34375, "epoch": 2.588014981273408, "grad_norm": 0.7296711206436157, "kl": 0.18641707301139832, "learning_rate": 2.818144689945712e-07, "loss": 0.0075, "reward": 1.7614688873291016, "reward_std": 0.5029410123825073, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.464593768119812, "step": 4837 }, { "completion_length": 149.84375, "epoch": 2.588550026752274, "grad_norm": 0.9449751973152161, "kl": 0.16972409188747406, "learning_rate": 2.8109698601332306e-07, "loss": 0.0068, "reward": 1.7296563386917114, "reward_std": 0.6484040021896362, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44840624928474426, "step": 4838 }, { "completion_length": 134.78125, "epoch": 2.5890850722311396, "grad_norm": 1.4209866523742676, "kl": 0.21760569512844086, "learning_rate": 2.8038036312140335e-07, "loss": 0.0087, "reward": 2.2317187786102295, "reward_std": 0.8501752614974976, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4660937488079071, "step": 4839 }, { "completion_length": 131.84375, "epoch": 2.5896201177100053, "grad_norm": 2.4688780307769775, "kl": 0.23876738548278809, "learning_rate": 2.7966460059658835e-07, "loss": 0.0096, "reward": 1.905093789100647, "reward_std": 0.7189099788665771, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4675937592983246, "step": 4840 }, { "completion_length": 138.4375, "epoch": 2.590155163188871, "grad_norm": 1.165601134300232, "kl": 0.140963613986969, "learning_rate": 2.789496987163212e-07, "loss": 0.0056, "reward": 2.1089062690734863, "reward_std": 0.7973887920379639, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46828126907348633, "step": 4841 }, { "completion_length": 120.125, "epoch": 2.590690208667737, "grad_norm": 0.9577881693840027, "kl": 0.17682096362113953, "learning_rate": 2.782356577577125e-07, "loss": 0.0071, "reward": 2.640625, "reward_std": 0.8995460867881775, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 4842 }, { "completion_length": 139.78125, "epoch": 2.5912252541466025, "grad_norm": 1.5567331314086914, "kl": 0.28421011567115784, "learning_rate": 2.7752247799753773e-07, "loss": 0.0114, "reward": 2.3908748626708984, "reward_std": 1.0796295404434204, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4533750116825104, "step": 4843 }, { "completion_length": 126.21875, "epoch": 2.5917602996254683, "grad_norm": 3.631819009780884, "kl": 0.389081746339798, "learning_rate": 2.7681015971223856e-07, "loss": 0.0156, "reward": 2.2207813262939453, "reward_std": 0.71424800157547, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48640626668930054, "step": 4844 }, { "completion_length": 138.46875, "epoch": 2.592295345104334, "grad_norm": 1.4264483451843262, "kl": 0.2853081524372101, "learning_rate": 2.760987031779239e-07, "loss": 0.0114, "reward": 1.684531331062317, "reward_std": 0.8004500269889832, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4501562714576721, "step": 4845 }, { "completion_length": 132.03125, "epoch": 2.5928303905831998, "grad_norm": 0.45597636699676514, "kl": 0.16142256557941437, "learning_rate": 2.7538810867036776e-07, "loss": 0.0065, "reward": 3.166562557220459, "reward_std": 0.6342766880989075, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4946874976158142, "step": 4846 }, { "completion_length": 152.15625, "epoch": 2.593365436062065, "grad_norm": 0.6251595616340637, "kl": 0.16634024679660797, "learning_rate": 2.7467837646500934e-07, "loss": 0.0067, "reward": 2.019656181335449, "reward_std": 0.763243556022644, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.441531240940094, "step": 4847 }, { "completion_length": 139.71875, "epoch": 2.5939004815409312, "grad_norm": 0.7590245008468628, "kl": 0.15392258763313293, "learning_rate": 2.7396950683695634e-07, "loss": 0.0062, "reward": 1.778656244277954, "reward_std": 0.979529082775116, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4192812442779541, "step": 4848 }, { "completion_length": 124.15625, "epoch": 2.5944355270197965, "grad_norm": 1.0420085191726685, "kl": 0.18186306953430176, "learning_rate": 2.732615000609781e-07, "loss": 0.0073, "reward": 2.59375, "reward_std": 0.4789544939994812, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 4849 }, { "completion_length": 116.8125, "epoch": 2.5949705724986623, "grad_norm": 2.053834915161133, "kl": 0.21445037424564362, "learning_rate": 2.7255435641151265e-07, "loss": 0.0086, "reward": 2.209031343460083, "reward_std": 0.6874871253967285, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47465628385543823, "step": 4850 }, { "completion_length": 140.84375, "epoch": 2.595505617977528, "grad_norm": 0.6057919859886169, "kl": 0.1963566541671753, "learning_rate": 2.7184807616266325e-07, "loss": 0.0079, "reward": 2.4410624504089355, "reward_std": 0.8691391944885254, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4254375100135803, "step": 4851 }, { "completion_length": 115.21875, "epoch": 2.5960406634563937, "grad_norm": 1.537177324295044, "kl": 0.2577967941761017, "learning_rate": 2.711426595881955e-07, "loss": 0.0103, "reward": 1.87890625, "reward_std": 0.6741155982017517, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 4852 }, { "completion_length": 149.5625, "epoch": 2.5965757089352595, "grad_norm": 0.9610663652420044, "kl": 0.29146289825439453, "learning_rate": 2.7043810696154443e-07, "loss": 0.0117, "reward": 1.7415624856948853, "reward_std": 0.8096280097961426, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46031251549720764, "step": 4853 }, { "completion_length": 124.78125, "epoch": 2.597110754414125, "grad_norm": 1.2335965633392334, "kl": 0.18408560752868652, "learning_rate": 2.6973441855580736e-07, "loss": 0.0074, "reward": 3.328125, "reward_std": 0.3008032441139221, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 4854 }, { "completion_length": 139.34375, "epoch": 2.597645799892991, "grad_norm": 1.4173980951309204, "kl": 0.1591334044933319, "learning_rate": 2.690315946437466e-07, "loss": 0.0064, "reward": 1.6917188167572021, "reward_std": 0.550529956817627, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4573437571525574, "step": 4855 }, { "completion_length": 106.0625, "epoch": 2.5981808453718567, "grad_norm": 2.1192474365234375, "kl": 0.17549867928028107, "learning_rate": 2.6832963549779154e-07, "loss": 0.007, "reward": 2.446593761444092, "reward_std": 0.8796180486679077, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4934687614440918, "step": 4856 }, { "completion_length": 148.03125, "epoch": 2.5987158908507224, "grad_norm": 11.104905128479004, "kl": 0.21952663362026215, "learning_rate": 2.676285413900345e-07, "loss": 0.0088, "reward": 2.1479687690734863, "reward_std": 1.1201740503311157, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41359376907348633, "step": 4857 }, { "completion_length": 129.90625, "epoch": 2.599250936329588, "grad_norm": 1.3619085550308228, "kl": 0.33453720808029175, "learning_rate": 2.669283125922328e-07, "loss": 0.0134, "reward": 2.019406318664551, "reward_std": 0.6933616399765015, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.472531259059906, "step": 4858 }, { "completion_length": 117.84375, "epoch": 2.599785981808454, "grad_norm": 0.8851857781410217, "kl": 0.20856843888759613, "learning_rate": 2.6622894937580826e-07, "loss": 0.0083, "reward": 2.4729061126708984, "reward_std": 0.7847467660903931, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4729062616825104, "step": 4859 }, { "completion_length": 127.9375, "epoch": 2.600321027287319, "grad_norm": 1.0427827835083008, "kl": 0.16843439638614655, "learning_rate": 2.6553045201184816e-07, "loss": 0.0067, "reward": 1.9704999923706055, "reward_std": 0.7351338863372803, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45487499237060547, "step": 4860 }, { "completion_length": 105.84375, "epoch": 2.6008560727661854, "grad_norm": 6.834567546844482, "kl": 0.6946665644645691, "learning_rate": 2.6483282077110347e-07, "loss": 0.0278, "reward": 2.0155625343322754, "reward_std": 0.276606023311615, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 4861 }, { "completion_length": 115.8125, "epoch": 2.6013911182450506, "grad_norm": 2.19089412689209, "kl": 0.19273778796195984, "learning_rate": 2.6413605592398834e-07, "loss": 0.0077, "reward": 2.8683438301086426, "reward_std": 0.8051213026046753, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4933437407016754, "step": 4862 }, { "completion_length": 108.96875, "epoch": 2.6019261637239164, "grad_norm": 1.3877335786819458, "kl": 0.401988685131073, "learning_rate": 2.634401577405835e-07, "loss": 0.0161, "reward": 2.1663124561309814, "reward_std": 0.6493601202964783, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4788125157356262, "step": 4863 }, { "completion_length": 120.40625, "epoch": 2.602461209202782, "grad_norm": 0.9987930655479431, "kl": 0.31985846161842346, "learning_rate": 2.6274512649063165e-07, "loss": 0.0128, "reward": 1.827125072479248, "reward_std": 0.5429140329360962, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48337501287460327, "step": 4864 }, { "completion_length": 118.4375, "epoch": 2.602996254681648, "grad_norm": 0.7888460755348206, "kl": 0.16502386331558228, "learning_rate": 2.6205096244353995e-07, "loss": 0.0066, "reward": 2.8700313568115234, "reward_std": 1.0105676651000977, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4950312674045563, "step": 4865 }, { "completion_length": 133.6875, "epoch": 2.6035313001605136, "grad_norm": 0.6588671803474426, "kl": 0.18900220096111298, "learning_rate": 2.6135766586838083e-07, "loss": 0.0076, "reward": 2.18387508392334, "reward_std": 0.7703716158866882, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46512502431869507, "step": 4866 }, { "completion_length": 147.21875, "epoch": 2.6040663456393793, "grad_norm": 0.8015381097793579, "kl": 0.19591137766838074, "learning_rate": 2.6066523703388857e-07, "loss": 0.0078, "reward": 1.6233437061309814, "reward_std": 0.7746423482894897, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4045937657356262, "step": 4867 }, { "completion_length": 122.75, "epoch": 2.604601391118245, "grad_norm": 0.7363163232803345, "kl": 0.17185352742671967, "learning_rate": 2.59973676208462e-07, "loss": 0.0069, "reward": 2.5536251068115234, "reward_std": 1.080182671546936, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4754999876022339, "step": 4868 }, { "completion_length": 142.875, "epoch": 2.605136436597111, "grad_norm": 0.9724155068397522, "kl": 0.14572134613990784, "learning_rate": 2.592829836601629e-07, "loss": 0.0058, "reward": 2.385531425476074, "reward_std": 0.7726714015007019, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4480312466621399, "step": 4869 }, { "completion_length": 139.90625, "epoch": 2.6056714820759765, "grad_norm": 1.7316279411315918, "kl": 0.1405261754989624, "learning_rate": 2.5859315965671763e-07, "loss": 0.0056, "reward": 2.0546250343322754, "reward_std": 0.8135286569595337, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.476500004529953, "step": 4870 }, { "completion_length": 134.0, "epoch": 2.6062065275548423, "grad_norm": 0.8852054476737976, "kl": 0.2153865098953247, "learning_rate": 2.57904204465515e-07, "loss": 0.0086, "reward": 1.8644062280654907, "reward_std": 0.6215662360191345, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4737812280654907, "step": 4871 }, { "completion_length": 134.71875, "epoch": 2.606741573033708, "grad_norm": 0.9008274674415588, "kl": 0.19933167099952698, "learning_rate": 2.5721611835360647e-07, "loss": 0.008, "reward": 1.4782187938690186, "reward_std": 0.5939121842384338, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4469687342643738, "step": 4872 }, { "completion_length": 150.625, "epoch": 2.6072766185125733, "grad_norm": 0.9663668870925903, "kl": 0.25991007685661316, "learning_rate": 2.565289015877093e-07, "loss": 0.0104, "reward": 1.5113749504089355, "reward_std": 0.8078463077545166, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4176250100135803, "step": 4873 }, { "completion_length": 124.6875, "epoch": 2.6078116639914395, "grad_norm": 4.938780784606934, "kl": 0.7171041965484619, "learning_rate": 2.558425544341994e-07, "loss": 0.0287, "reward": 1.664156198501587, "reward_std": 0.7548350691795349, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4610312581062317, "step": 4874 }, { "completion_length": 135.375, "epoch": 2.608346709470305, "grad_norm": 0.6105002164840698, "kl": 0.1760719120502472, "learning_rate": 2.5515707715911943e-07, "loss": 0.007, "reward": 2.029656410217285, "reward_std": 0.703182578086853, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46715623140335083, "step": 4875 }, { "completion_length": 146.1875, "epoch": 2.6088817549491705, "grad_norm": 1.2655339241027832, "kl": 0.17123115062713623, "learning_rate": 2.544724700281745e-07, "loss": 0.0068, "reward": 1.95703125, "reward_std": 0.9341597557067871, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 4876 }, { "completion_length": 142.3125, "epoch": 2.6094168004280363, "grad_norm": 1.4878264665603638, "kl": 0.1923401951789856, "learning_rate": 2.537887333067293e-07, "loss": 0.0077, "reward": 1.4594062566757202, "reward_std": 0.6094259023666382, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4281562566757202, "step": 4877 }, { "completion_length": 110.1875, "epoch": 2.609951845906902, "grad_norm": 1.5069224834442139, "kl": 0.5500730872154236, "learning_rate": 2.5310586725981466e-07, "loss": 0.022, "reward": 2.303874969482422, "reward_std": 0.7943322658538818, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47574999928474426, "step": 4878 }, { "completion_length": 109.1875, "epoch": 2.6104868913857677, "grad_norm": 0.9051865935325623, "kl": 0.1978195607662201, "learning_rate": 2.5242387215212246e-07, "loss": 0.0079, "reward": 2.7984061241149902, "reward_std": 0.3627651035785675, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4859062433242798, "step": 4879 }, { "completion_length": 141.6875, "epoch": 2.6110219368646335, "grad_norm": 12.290865898132324, "kl": 0.34387701749801636, "learning_rate": 2.5174274824800645e-07, "loss": 0.0138, "reward": 1.745437502861023, "reward_std": 0.8237606287002563, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43293750286102295, "step": 4880 }, { "completion_length": 139.8125, "epoch": 2.611556982343499, "grad_norm": 1.612742304801941, "kl": 0.18061372637748718, "learning_rate": 2.5106249581148406e-07, "loss": 0.0072, "reward": 2.360156297683716, "reward_std": 0.73954176902771, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.40703123807907104, "step": 4881 }, { "completion_length": 124.03125, "epoch": 2.612092027822365, "grad_norm": 0.4510515034198761, "kl": 0.16319528222084045, "learning_rate": 2.5038311510623357e-07, "loss": 0.0065, "reward": 2.359375, "reward_std": 0.47974544763565063, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 4882 }, { "completion_length": 125.53125, "epoch": 2.6126270733012307, "grad_norm": 0.5941981077194214, "kl": 0.2522430419921875, "learning_rate": 2.4970460639559604e-07, "loss": 0.0101, "reward": 2.5877187252044678, "reward_std": 0.7109395265579224, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49396875500679016, "step": 4883 }, { "completion_length": 130.1875, "epoch": 2.6131621187800964, "grad_norm": 1.1185455322265625, "kl": 0.1910172402858734, "learning_rate": 2.4902696994257516e-07, "loss": 0.0076, "reward": 2.3834376335144043, "reward_std": 0.5065972208976746, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.39906251430511475, "step": 4884 }, { "completion_length": 128.625, "epoch": 2.613697164258962, "grad_norm": 1.4480059146881104, "kl": 0.19990146160125732, "learning_rate": 2.48350206009835e-07, "loss": 0.008, "reward": 1.9296875, "reward_std": 0.7036109566688538, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 4885 }, { "completion_length": 127.65625, "epoch": 2.6142322097378274, "grad_norm": 24.043485641479492, "kl": 1.1053669452667236, "learning_rate": 2.4767431485970247e-07, "loss": 0.0442, "reward": 1.4295313358306885, "reward_std": 0.6767362356185913, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4451562762260437, "step": 4886 }, { "completion_length": 129.84375, "epoch": 2.6147672552166936, "grad_norm": 3.4153542518615723, "kl": 0.25821229815483093, "learning_rate": 2.469992967541657e-07, "loss": 0.0103, "reward": 1.9965312480926514, "reward_std": 0.6937451362609863, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46528124809265137, "step": 4887 }, { "completion_length": 155.21875, "epoch": 2.615302300695559, "grad_norm": 0.933861494064331, "kl": 0.15006974339485168, "learning_rate": 2.463251519548751e-07, "loss": 0.006, "reward": 2.1142499446868896, "reward_std": 1.0285439491271973, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4111250042915344, "step": 4888 }, { "completion_length": 117.1875, "epoch": 2.615837346174425, "grad_norm": 0.8247405886650085, "kl": 0.1652083694934845, "learning_rate": 2.456518807231423e-07, "loss": 0.0066, "reward": 2.7704062461853027, "reward_std": 0.6723759770393372, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48915624618530273, "step": 4889 }, { "completion_length": 133.25, "epoch": 2.6163723916532904, "grad_norm": 1.7738622426986694, "kl": 0.22886677086353302, "learning_rate": 2.44979483319939e-07, "loss": 0.0092, "reward": 1.7785937786102295, "reward_std": 0.6804461479187012, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4660937488079071, "step": 4890 }, { "completion_length": 137.15625, "epoch": 2.616907437132156, "grad_norm": 1.0346273183822632, "kl": 0.16910263895988464, "learning_rate": 2.4430796000590033e-07, "loss": 0.0068, "reward": 1.9567188024520874, "reward_std": 1.1754529476165771, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4567187428474426, "step": 4891 }, { "completion_length": 134.09375, "epoch": 2.617442482611022, "grad_norm": 0.9829836487770081, "kl": 0.19737552106380463, "learning_rate": 2.436373110413215e-07, "loss": 0.0079, "reward": 2.1413750648498535, "reward_std": 0.8237558007240295, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45387500524520874, "step": 4892 }, { "completion_length": 141.71875, "epoch": 2.6179775280898876, "grad_norm": 0.8929731249809265, "kl": 0.2704375982284546, "learning_rate": 2.429675366861584e-07, "loss": 0.0108, "reward": 2.027218818664551, "reward_std": 0.6337257027626038, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.449093759059906, "step": 4893 }, { "completion_length": 140.125, "epoch": 2.6185125735687533, "grad_norm": 0.6529427766799927, "kl": 0.16209645569324493, "learning_rate": 2.422986372000294e-07, "loss": 0.0065, "reward": 1.4664688110351562, "reward_std": 0.5768090486526489, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4508437514305115, "step": 4894 }, { "completion_length": 118.78125, "epoch": 2.619047619047619, "grad_norm": 1.8506003618240356, "kl": 0.16551728546619415, "learning_rate": 2.4163061284221134e-07, "loss": 0.0066, "reward": 2.173093795776367, "reward_std": 0.6545591354370117, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4699687361717224, "step": 4895 }, { "completion_length": 140.46875, "epoch": 2.619582664526485, "grad_norm": 0.790698230266571, "kl": 0.1979818046092987, "learning_rate": 2.409634638716443e-07, "loss": 0.0079, "reward": 1.5747812986373901, "reward_std": 0.5808618068695068, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46540623903274536, "step": 4896 }, { "completion_length": 133.9375, "epoch": 2.6201177100053505, "grad_norm": 0.9898820519447327, "kl": 0.16934734582901, "learning_rate": 2.402971905469279e-07, "loss": 0.0068, "reward": 2.3350625038146973, "reward_std": 0.9293358325958252, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47568750381469727, "step": 4897 }, { "completion_length": 138.5625, "epoch": 2.6206527554842163, "grad_norm": 4.997610569000244, "kl": 0.5499590635299683, "learning_rate": 2.3963179312632176e-07, "loss": 0.022, "reward": 2.0887813568115234, "reward_std": 0.4932902157306671, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4325312376022339, "step": 4898 }, { "completion_length": 131.625, "epoch": 2.621187800963082, "grad_norm": 0.7813995480537415, "kl": 0.17975351214408875, "learning_rate": 2.3896727186774743e-07, "loss": 0.0072, "reward": 1.9850313663482666, "reward_std": 0.33639171719551086, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46940624713897705, "step": 4899 }, { "completion_length": 119.0, "epoch": 2.6217228464419478, "grad_norm": 1.2640501260757446, "kl": 0.19811345636844635, "learning_rate": 2.3830362702878613e-07, "loss": 0.0079, "reward": 2.833343744277954, "reward_std": 0.3369138836860657, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4739687442779541, "step": 4900 }, { "completion_length": 153.0, "epoch": 2.622257891920813, "grad_norm": 1.9267675876617432, "kl": 0.33307090401649475, "learning_rate": 2.376408588666787e-07, "loss": 0.0133, "reward": 1.8828125, "reward_std": 1.1368064880371094, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4453125, "step": 4901 }, { "completion_length": 131.46875, "epoch": 2.6227929373996792, "grad_norm": 1.372067928314209, "kl": 0.29527997970581055, "learning_rate": 2.369789676383269e-07, "loss": 0.0118, "reward": 1.7265937328338623, "reward_std": 0.9511843919754028, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4609687328338623, "step": 4902 }, { "completion_length": 127.96875, "epoch": 2.6233279828785445, "grad_norm": 2.2265446186065674, "kl": 0.28552162647247314, "learning_rate": 2.3631795360029296e-07, "loss": 0.0114, "reward": 2.4274375438690186, "reward_std": 0.651891827583313, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4586874842643738, "step": 4903 }, { "completion_length": 133.21875, "epoch": 2.6238630283574103, "grad_norm": 1.52547287940979, "kl": 0.2793383002281189, "learning_rate": 2.3565781700879808e-07, "loss": 0.0112, "reward": 1.6019999980926514, "reward_std": 0.7613542675971985, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46137499809265137, "step": 4904 }, { "completion_length": 142.75, "epoch": 2.624398073836276, "grad_norm": 3.9978623390197754, "kl": 0.1681193858385086, "learning_rate": 2.3499855811972393e-07, "loss": 0.0067, "reward": 1.7265625, "reward_std": 0.7404609322547913, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4609375, "step": 4905 }, { "completion_length": 125.75, "epoch": 2.6249331193151417, "grad_norm": 1.788809061050415, "kl": 0.2712358832359314, "learning_rate": 2.3434017718861224e-07, "loss": 0.0108, "reward": 1.9385937452316284, "reward_std": 0.8220996856689453, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4229687750339508, "step": 4906 }, { "completion_length": 113.28125, "epoch": 2.6254681647940075, "grad_norm": 0.9339808225631714, "kl": 0.2092975378036499, "learning_rate": 2.3368267447066383e-07, "loss": 0.0084, "reward": 2.523250102996826, "reward_std": 0.8572158813476562, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.476375013589859, "step": 4907 }, { "completion_length": 115.5, "epoch": 2.626003210272873, "grad_norm": 0.6659748554229736, "kl": 0.18745586276054382, "learning_rate": 2.3302605022073904e-07, "loss": 0.0075, "reward": 2.3903751373291016, "reward_std": 0.3130946159362793, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499750018119812, "step": 4908 }, { "completion_length": 122.53125, "epoch": 2.626538255751739, "grad_norm": 1.1837519407272339, "kl": 0.17611494660377502, "learning_rate": 2.323703046933587e-07, "loss": 0.007, "reward": 2.2908437252044678, "reward_std": 0.6753538250923157, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44709375500679016, "step": 4909 }, { "completion_length": 136.4375, "epoch": 2.6270733012306047, "grad_norm": 0.8223354816436768, "kl": 0.16013729572296143, "learning_rate": 2.3171543814270198e-07, "loss": 0.0064, "reward": 2.1972498893737793, "reward_std": 0.6860733032226562, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44725000858306885, "step": 4910 }, { "completion_length": 144.40625, "epoch": 2.6276083467094704, "grad_norm": 0.6818652749061584, "kl": 0.18893763422966003, "learning_rate": 2.3106145082260777e-07, "loss": 0.0076, "reward": 1.8271875381469727, "reward_std": 0.4614414870738983, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4209374785423279, "step": 4911 }, { "completion_length": 121.875, "epoch": 2.628143392188336, "grad_norm": 1.0257511138916016, "kl": 0.2194196879863739, "learning_rate": 2.3040834298657332e-07, "loss": 0.0088, "reward": 2.3123438358306885, "reward_std": 0.8571019768714905, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4842187762260437, "step": 4912 }, { "completion_length": 139.1875, "epoch": 2.628678437667202, "grad_norm": 1.434980869293213, "kl": 0.2704678773880005, "learning_rate": 2.2975611488775723e-07, "loss": 0.0108, "reward": 1.901187539100647, "reward_std": 0.6634011268615723, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4793125092983246, "step": 4913 }, { "completion_length": 117.625, "epoch": 2.629213483146067, "grad_norm": 0.8288341164588928, "kl": 0.23908674716949463, "learning_rate": 2.2910476677897447e-07, "loss": 0.0096, "reward": 2.154031276702881, "reward_std": 0.9131760597229004, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48215624690055847, "step": 4914 }, { "completion_length": 136.5, "epoch": 2.6297485286249334, "grad_norm": 2.928246021270752, "kl": 0.36996743083000183, "learning_rate": 2.2845429891270033e-07, "loss": 0.0148, "reward": 1.9375, "reward_std": 0.5867290496826172, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.484375, "step": 4915 }, { "completion_length": 104.0625, "epoch": 2.6302835741037986, "grad_norm": 0.6381888389587402, "kl": 0.2615523040294647, "learning_rate": 2.2780471154106864e-07, "loss": 0.0105, "reward": 3.130499839782715, "reward_std": 0.8062368631362915, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4898749887943268, "step": 4916 }, { "completion_length": 113.9375, "epoch": 2.6308186195826644, "grad_norm": 1.041969895362854, "kl": 0.2569020986557007, "learning_rate": 2.2715600491587247e-07, "loss": 0.0103, "reward": 2.1684374809265137, "reward_std": 0.7718713283538818, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48093751072883606, "step": 4917 }, { "completion_length": 128.5625, "epoch": 2.63135366506153, "grad_norm": 0.6162254214286804, "kl": 0.18009328842163086, "learning_rate": 2.2650817928856178e-07, "loss": 0.0072, "reward": 2.384312629699707, "reward_std": 0.5774322748184204, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4936875104904175, "step": 4918 }, { "completion_length": 166.09375, "epoch": 2.631888710540396, "grad_norm": 0.4599025249481201, "kl": 0.11893877387046814, "learning_rate": 2.2586123491024764e-07, "loss": 0.0048, "reward": 1.8957186937332153, "reward_std": 0.7111595869064331, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4425937533378601, "step": 4919 }, { "completion_length": 122.9375, "epoch": 2.6324237560192616, "grad_norm": 2.155306100845337, "kl": 0.33080625534057617, "learning_rate": 2.252151720316964e-07, "loss": 0.0132, "reward": 2.5825624465942383, "reward_std": 0.7623037695884705, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47318750619888306, "step": 4920 }, { "completion_length": 139.5625, "epoch": 2.6329588014981273, "grad_norm": 1.0947643518447876, "kl": 0.15515130758285522, "learning_rate": 2.2456999090333525e-07, "loss": 0.0062, "reward": 2.375218629837036, "reward_std": 1.0578186511993408, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4689687490463257, "step": 4921 }, { "completion_length": 113.25, "epoch": 2.633493846976993, "grad_norm": 0.7240694165229797, "kl": 0.18889480829238892, "learning_rate": 2.2392569177524998e-07, "loss": 0.0076, "reward": 2.322499990463257, "reward_std": 0.3964383602142334, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49437499046325684, "step": 4922 }, { "completion_length": 152.9375, "epoch": 2.634028892455859, "grad_norm": 0.6789987087249756, "kl": 0.14363813400268555, "learning_rate": 2.2328227489718107e-07, "loss": 0.0057, "reward": 1.5747499465942383, "reward_std": 0.7182070016860962, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46537500619888306, "step": 4923 }, { "completion_length": 142.59375, "epoch": 2.6345639379347245, "grad_norm": 0.8076709508895874, "kl": 0.17190615832805634, "learning_rate": 2.2263974051853072e-07, "loss": 0.0069, "reward": 2.190093755722046, "reward_std": 0.7702388167381287, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4557187557220459, "step": 4924 }, { "completion_length": 123.90625, "epoch": 2.6350989834135903, "grad_norm": 0.8719336986541748, "kl": 0.1715749204158783, "learning_rate": 2.2199808888835688e-07, "loss": 0.0069, "reward": 2.0012500286102295, "reward_std": 0.4315382242202759, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4699999988079071, "step": 4925 }, { "completion_length": 136.0625, "epoch": 2.635634028892456, "grad_norm": 2.340762138366699, "kl": 0.4891822636127472, "learning_rate": 2.213573202553762e-07, "loss": 0.0196, "reward": 2.019406318664551, "reward_std": 1.0441844463348389, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.472531259059906, "step": 4926 }, { "completion_length": 110.9375, "epoch": 2.6361690743713213, "grad_norm": 2.3336803913116455, "kl": 0.1850774884223938, "learning_rate": 2.2071743486796304e-07, "loss": 0.0074, "reward": 2.4217188358306885, "reward_std": 0.39112740755081177, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4685937464237213, "step": 4927 }, { "completion_length": 119.0, "epoch": 2.6367041198501875, "grad_norm": 0.49335452914237976, "kl": 0.22081170976161957, "learning_rate": 2.2007843297414954e-07, "loss": 0.0088, "reward": 2.2946250438690186, "reward_std": 0.5162919163703918, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4664999842643738, "step": 4928 }, { "completion_length": 140.03125, "epoch": 2.637239165329053, "grad_norm": 1.063476324081421, "kl": 0.16209901869297028, "learning_rate": 2.1944031482162425e-07, "loss": 0.0065, "reward": 1.567062497138977, "reward_std": 0.6906332969665527, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.42643749713897705, "step": 4929 }, { "completion_length": 128.59375, "epoch": 2.6377742108079185, "grad_norm": 0.6452338099479675, "kl": 0.19480562210083008, "learning_rate": 2.1880308065773432e-07, "loss": 0.0078, "reward": 1.7254999876022339, "reward_std": 0.9446747303009033, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4286249876022339, "step": 4930 }, { "completion_length": 134.28125, "epoch": 2.6383092562867843, "grad_norm": 1.1641325950622559, "kl": 0.18122166395187378, "learning_rate": 2.1816673072948436e-07, "loss": 0.0072, "reward": 1.7195625305175781, "reward_std": 0.7809972763061523, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43831250071525574, "step": 4931 }, { "completion_length": 145.96875, "epoch": 2.63884430176565, "grad_norm": 1.0497970581054688, "kl": 0.15888477861881256, "learning_rate": 2.175312652835354e-07, "loss": 0.0064, "reward": 1.6115624904632568, "reward_std": 0.5711382627487183, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47093749046325684, "step": 4932 }, { "completion_length": 139.1875, "epoch": 2.6393793472445157, "grad_norm": 1.5913318395614624, "kl": 0.273851215839386, "learning_rate": 2.1689668456620545e-07, "loss": 0.011, "reward": 2.162937641143799, "reward_std": 0.958249568939209, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4598124921321869, "step": 4933 }, { "completion_length": 140.125, "epoch": 2.6399143927233815, "grad_norm": 1.0849800109863281, "kl": 0.13535207509994507, "learning_rate": 2.1626298882347158e-07, "loss": 0.0054, "reward": 2.4535937309265137, "reward_std": 0.7851350903511047, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48484376072883606, "step": 4934 }, { "completion_length": 135.875, "epoch": 2.640449438202247, "grad_norm": 1.1358271837234497, "kl": 0.17294983565807343, "learning_rate": 2.1563017830096538e-07, "loss": 0.0069, "reward": 1.7339375019073486, "reward_std": 0.5780156850814819, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49956250190734863, "step": 4935 }, { "completion_length": 127.71875, "epoch": 2.640984483681113, "grad_norm": 1.1057143211364746, "kl": 0.2039421796798706, "learning_rate": 2.1499825324397622e-07, "loss": 0.0082, "reward": 1.76171875, "reward_std": 0.6732672452926636, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48046875, "step": 4936 }, { "completion_length": 135.4375, "epoch": 2.6415195291599787, "grad_norm": 4.336056709289551, "kl": 0.5362308621406555, "learning_rate": 2.143672138974512e-07, "loss": 0.0214, "reward": 2.3795623779296875, "reward_std": 0.8819432258605957, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48893749713897705, "step": 4937 }, { "completion_length": 142.53125, "epoch": 2.6420545746388444, "grad_norm": 0.9177870154380798, "kl": 0.17059434950351715, "learning_rate": 2.137370605059927e-07, "loss": 0.0068, "reward": 1.62890625, "reward_std": 0.4858771860599518, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48828125, "step": 4938 }, { "completion_length": 143.28125, "epoch": 2.64258962011771, "grad_norm": 1.064853549003601, "kl": 0.3479408025741577, "learning_rate": 2.1310779331386032e-07, "loss": 0.0139, "reward": 1.5693750381469727, "reward_std": 0.7853681445121765, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.4756249785423279, "step": 4939 }, { "completion_length": 121.75, "epoch": 2.6431246655965754, "grad_norm": 0.590266227722168, "kl": 0.20787331461906433, "learning_rate": 2.1247941256496956e-07, "loss": 0.0083, "reward": 2.5625, "reward_std": 0.4082317352294922, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 4940 }, { "completion_length": 118.25, "epoch": 2.6436597110754416, "grad_norm": 0.46039941906929016, "kl": 0.19757679104804993, "learning_rate": 2.118519185028939e-07, "loss": 0.0079, "reward": 3.0445001125335693, "reward_std": 0.3708432912826538, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4976249933242798, "step": 4941 }, { "completion_length": 139.0, "epoch": 2.644194756554307, "grad_norm": 0.6736041903495789, "kl": 0.20636457204818726, "learning_rate": 2.112253113708615e-07, "loss": 0.0083, "reward": 1.9542499780654907, "reward_std": 0.9839709997177124, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4386250078678131, "step": 4942 }, { "completion_length": 125.15625, "epoch": 2.644729802033173, "grad_norm": 1.0906784534454346, "kl": 0.19424383342266083, "learning_rate": 2.10599591411757e-07, "loss": 0.0078, "reward": 1.8571875095367432, "reward_std": 0.8186757564544678, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46656250953674316, "step": 4943 }, { "completion_length": 128.4375, "epoch": 2.6452648475120384, "grad_norm": 0.7683455944061279, "kl": 0.2070275843143463, "learning_rate": 2.0997475886812253e-07, "loss": 0.0083, "reward": 1.6885625123977661, "reward_std": 0.6592059135437012, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4541875123977661, "step": 4944 }, { "completion_length": 92.65625, "epoch": 2.645799892990904, "grad_norm": 1.2141975164413452, "kl": 0.29969364404678345, "learning_rate": 2.0935081398215346e-07, "loss": 0.012, "reward": 3.183093786239624, "reward_std": 0.613432765007019, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49559375643730164, "step": 4945 }, { "completion_length": 152.65625, "epoch": 2.64633493846977, "grad_norm": 3.720935583114624, "kl": 0.26410773396492004, "learning_rate": 2.0872775699570386e-07, "loss": 0.0106, "reward": 1.5219062566757202, "reward_std": 0.49142783880233765, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3656562268733978, "step": 4946 }, { "completion_length": 113.875, "epoch": 2.6468699839486356, "grad_norm": 0.6802629828453064, "kl": 0.18477576971054077, "learning_rate": 2.0810558815028303e-07, "loss": 0.0074, "reward": 2.209749937057495, "reward_std": 0.4613471031188965, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4441249966621399, "step": 4947 }, { "completion_length": 140.15625, "epoch": 2.6474050294275013, "grad_norm": 1.982016682624817, "kl": 0.12889263033866882, "learning_rate": 2.0748430768705445e-07, "loss": 0.0052, "reward": 1.6861562728881836, "reward_std": 0.8946680426597595, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.265625, "rewards/xmlcount_reward_func": 0.3736562728881836, "step": 4948 }, { "completion_length": 120.8125, "epoch": 2.647940074906367, "grad_norm": 3.8653268814086914, "kl": 0.21763688325881958, "learning_rate": 2.0686391584683885e-07, "loss": 0.0087, "reward": 2.4793124198913574, "reward_std": 0.6285771131515503, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4324374794960022, "step": 4949 }, { "completion_length": 125.0, "epoch": 2.648475120385233, "grad_norm": 0.9229434728622437, "kl": 0.2125953584909439, "learning_rate": 2.0624441287011215e-07, "loss": 0.0085, "reward": 2.5491561889648438, "reward_std": 0.83522629737854, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4397812485694885, "step": 4950 }, { "completion_length": 138.03125, "epoch": 2.6490101658640985, "grad_norm": 2.1109931468963623, "kl": 0.15175586938858032, "learning_rate": 2.056257989970048e-07, "loss": 0.0061, "reward": 1.839437484741211, "reward_std": 0.7029978036880493, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46443748474121094, "step": 4951 }, { "completion_length": 128.0625, "epoch": 2.6495452113429643, "grad_norm": 4.408036708831787, "kl": 0.13508588075637817, "learning_rate": 2.050080744673047e-07, "loss": 0.0054, "reward": 2.3884999752044678, "reward_std": 0.6599728465080261, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43537500500679016, "step": 4952 }, { "completion_length": 141.34375, "epoch": 2.65008025682183, "grad_norm": 1.1471668481826782, "kl": 0.22188815474510193, "learning_rate": 2.0439123952045253e-07, "loss": 0.0089, "reward": 2.0887811183929443, "reward_std": 1.069579839706421, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4169062376022339, "step": 4953 }, { "completion_length": 126.21875, "epoch": 2.6506153023006958, "grad_norm": 0.9363610148429871, "kl": 0.16452518105506897, "learning_rate": 2.0377529439554566e-07, "loss": 0.0066, "reward": 1.89453125, "reward_std": 0.5297657251358032, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 4954 }, { "completion_length": 135.5, "epoch": 2.651150347779561, "grad_norm": 0.8744297027587891, "kl": 0.23479272425174713, "learning_rate": 2.031602393313367e-07, "loss": 0.0094, "reward": 1.3810625076293945, "reward_std": 0.5539407730102539, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44356250762939453, "step": 4955 }, { "completion_length": 135.09375, "epoch": 2.6516853932584272, "grad_norm": 2.9856979846954346, "kl": 0.16105923056602478, "learning_rate": 2.025460745662325e-07, "loss": 0.0064, "reward": 1.8026561737060547, "reward_std": 0.57835853099823, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47453123331069946, "step": 4956 }, { "completion_length": 134.71875, "epoch": 2.6522204387372925, "grad_norm": 1.3416908979415894, "kl": 0.2690443992614746, "learning_rate": 2.0193280033829503e-07, "loss": 0.0108, "reward": 2.6815624237060547, "reward_std": 0.9258145093917847, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47843748331069946, "step": 4957 }, { "completion_length": 131.59375, "epoch": 2.6527554842161583, "grad_norm": 1.3039764165878296, "kl": 0.23966479301452637, "learning_rate": 2.0132041688524062e-07, "loss": 0.0096, "reward": 1.7492811679840088, "reward_std": 0.694396436214447, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46803125739097595, "step": 4958 }, { "completion_length": 117.375, "epoch": 2.653290529695024, "grad_norm": 1.0908979177474976, "kl": 0.1973690390586853, "learning_rate": 2.0070892444444186e-07, "loss": 0.0079, "reward": 2.229249954223633, "reward_std": 1.0741019248962402, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4792500138282776, "step": 4959 }, { "completion_length": 137.40625, "epoch": 2.6538255751738897, "grad_norm": 1.5689034461975098, "kl": 0.19704954326152802, "learning_rate": 2.0009832325292412e-07, "loss": 0.0079, "reward": 1.7432498931884766, "reward_std": 0.833899974822998, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4307500123977661, "step": 4960 }, { "completion_length": 112.21875, "epoch": 2.6543606206527555, "grad_norm": 0.7186186909675598, "kl": 0.20692789554595947, "learning_rate": 1.9948861354736838e-07, "loss": 0.0083, "reward": 2.0607500076293945, "reward_std": 0.7114283442497253, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48262500762939453, "step": 4961 }, { "completion_length": 135.46875, "epoch": 2.654895666131621, "grad_norm": 1.9381706714630127, "kl": 0.3570467233657837, "learning_rate": 1.988797955641103e-07, "loss": 0.0143, "reward": 1.429281234741211, "reward_std": 0.7030156254768372, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.44490623474121094, "step": 4962 }, { "completion_length": 115.0, "epoch": 2.655430711610487, "grad_norm": 0.9049230813980103, "kl": 0.17674216628074646, "learning_rate": 1.9827186953913834e-07, "loss": 0.0071, "reward": 2.5868749618530273, "reward_std": 0.5890552401542664, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49312499165534973, "step": 4963 }, { "completion_length": 136.03125, "epoch": 2.6559657570893527, "grad_norm": 0.6265022158622742, "kl": 0.16589924693107605, "learning_rate": 1.976648357080968e-07, "loss": 0.0066, "reward": 2.3261876106262207, "reward_std": 0.7593374848365784, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45118749141693115, "step": 4964 }, { "completion_length": 109.5, "epoch": 2.6565008025682184, "grad_norm": 4.209417343139648, "kl": 0.3887489438056946, "learning_rate": 1.9705869430628467e-07, "loss": 0.0156, "reward": 2.87290620803833, "reward_std": 0.9737766981124878, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48228126764297485, "step": 4965 }, { "completion_length": 154.0, "epoch": 2.657035848047084, "grad_norm": 0.7276599407196045, "kl": 0.1735718548297882, "learning_rate": 1.9645344556865204e-07, "loss": 0.0069, "reward": 1.4606562852859497, "reward_std": 1.0240447521209717, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3825312554836273, "step": 4966 }, { "completion_length": 143.71875, "epoch": 2.65757089352595, "grad_norm": 1.3810042142868042, "kl": 0.20329351723194122, "learning_rate": 1.9584908972980626e-07, "loss": 0.0081, "reward": 1.7642812728881836, "reward_std": 0.47179174423217773, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4205312430858612, "step": 4967 }, { "completion_length": 124.40625, "epoch": 2.658105939004815, "grad_norm": 1.0814156532287598, "kl": 0.16612306237220764, "learning_rate": 1.952456270240069e-07, "loss": 0.0066, "reward": 2.2568438053131104, "reward_std": 0.6840674877166748, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4755937457084656, "step": 4968 }, { "completion_length": 131.6875, "epoch": 2.6586409844836814, "grad_norm": 0.8849006295204163, "kl": 0.1495254933834076, "learning_rate": 1.9464305768516712e-07, "loss": 0.006, "reward": 1.9005000591278076, "reward_std": 0.5068273544311523, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47862499952316284, "step": 4969 }, { "completion_length": 153.71875, "epoch": 2.6591760299625467, "grad_norm": 1.015533685684204, "kl": 0.14747393131256104, "learning_rate": 1.9404138194685535e-07, "loss": 0.0059, "reward": 1.4120311737060547, "reward_std": 0.8476599454879761, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41203123331069946, "step": 4970 }, { "completion_length": 159.8125, "epoch": 2.6597110754414124, "grad_norm": 1.3533210754394531, "kl": 0.16593870520591736, "learning_rate": 1.9344060004229227e-07, "loss": 0.0067, "reward": 1.7211875915527344, "reward_std": 0.5484685897827148, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4243125319480896, "step": 4971 }, { "completion_length": 133.0625, "epoch": 2.660246120920278, "grad_norm": 0.867661714553833, "kl": 0.19974513351917267, "learning_rate": 1.9284071220435214e-07, "loss": 0.008, "reward": 2.640625, "reward_std": 0.6870663166046143, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 4972 }, { "completion_length": 139.65625, "epoch": 2.660781166399144, "grad_norm": 0.935877799987793, "kl": 0.21719932556152344, "learning_rate": 1.9224171866556285e-07, "loss": 0.0087, "reward": 1.5188437700271606, "reward_std": 0.5679771900177002, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47196877002716064, "step": 4973 }, { "completion_length": 134.28125, "epoch": 2.6613162118780096, "grad_norm": 0.7486304640769958, "kl": 0.16234681010246277, "learning_rate": 1.9164361965810647e-07, "loss": 0.0065, "reward": 1.725906252861023, "reward_std": 0.8006018400192261, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46028125286102295, "step": 4974 }, { "completion_length": 125.25, "epoch": 2.6618512573568753, "grad_norm": 2.074491500854492, "kl": 0.1704656481742859, "learning_rate": 1.9104641541381723e-07, "loss": 0.0068, "reward": 2.0448436737060547, "reward_std": 0.9320517778396606, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46671873331069946, "step": 4975 }, { "completion_length": 130.3125, "epoch": 2.662386302835741, "grad_norm": 0.8391955494880676, "kl": 0.16778141260147095, "learning_rate": 1.9045010616418252e-07, "loss": 0.0067, "reward": 2.248000144958496, "reward_std": 0.7771969437599182, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46675002574920654, "step": 4976 }, { "completion_length": 116.9375, "epoch": 2.662921348314607, "grad_norm": 0.945091962814331, "kl": 0.1733623445034027, "learning_rate": 1.8985469214034406e-07, "loss": 0.0069, "reward": 2.343625068664551, "reward_std": 0.8200249075889587, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 4977 }, { "completion_length": 136.25, "epoch": 2.6634563937934725, "grad_norm": 0.7950728535652161, "kl": 0.2060203105211258, "learning_rate": 1.8926017357309535e-07, "loss": 0.0082, "reward": 1.8190937042236328, "reward_std": 0.6031901240348816, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4909687638282776, "step": 4978 }, { "completion_length": 146.5, "epoch": 2.6639914392723383, "grad_norm": 0.7227170467376709, "kl": 0.19889044761657715, "learning_rate": 1.886665506928828e-07, "loss": 0.008, "reward": 1.7103437185287476, "reward_std": 0.7364458441734314, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42909374833106995, "step": 4979 }, { "completion_length": 156.40625, "epoch": 2.664526484751204, "grad_norm": 0.6225019693374634, "kl": 0.15960833430290222, "learning_rate": 1.8807382372980688e-07, "loss": 0.0064, "reward": 1.7430624961853027, "reward_std": 1.031076192855835, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43056249618530273, "step": 4980 }, { "completion_length": 147.0625, "epoch": 2.6650615302300693, "grad_norm": 0.6172080039978027, "kl": 0.17703776061534882, "learning_rate": 1.874819929136193e-07, "loss": 0.0071, "reward": 1.2380937337875366, "reward_std": 0.4910653233528137, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4568437337875366, "step": 4981 }, { "completion_length": 142.6875, "epoch": 2.6655965757089355, "grad_norm": 0.9033814668655396, "kl": 0.17733702063560486, "learning_rate": 1.8689105847372517e-07, "loss": 0.0071, "reward": 2.5989999771118164, "reward_std": 0.41979095339775085, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4896250069141388, "step": 4982 }, { "completion_length": 140.25, "epoch": 2.666131621187801, "grad_norm": 0.5662732124328613, "kl": 0.1594749391078949, "learning_rate": 1.8630102063918159e-07, "loss": 0.0064, "reward": 1.7101874351501465, "reward_std": 0.5009382367134094, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.39768749475479126, "step": 4983 }, { "completion_length": 135.28125, "epoch": 2.6666666666666665, "grad_norm": 1.729491949081421, "kl": 0.15879645943641663, "learning_rate": 1.8571187963869948e-07, "loss": 0.0064, "reward": 1.6304062604904175, "reward_std": 0.7229142189025879, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4585312604904175, "step": 4984 }, { "completion_length": 142.5, "epoch": 2.6672017121455323, "grad_norm": 0.6411563754081726, "kl": 0.17652133107185364, "learning_rate": 1.8512363570064028e-07, "loss": 0.0071, "reward": 2.5272812843322754, "reward_std": 0.8588271141052246, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.496031254529953, "step": 4985 }, { "completion_length": 124.8125, "epoch": 2.667736757624398, "grad_norm": 0.9930601716041565, "kl": 0.22621004283428192, "learning_rate": 1.845362890530189e-07, "loss": 0.009, "reward": 1.8834688663482666, "reward_std": 0.8607423305511475, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46159374713897705, "step": 4986 }, { "completion_length": 143.75, "epoch": 2.6682718031032637, "grad_norm": 1.270195722579956, "kl": 0.25158268213272095, "learning_rate": 1.8394983992350263e-07, "loss": 0.0101, "reward": 1.7974687814712524, "reward_std": 0.8858339786529541, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45371875166893005, "step": 4987 }, { "completion_length": 137.03125, "epoch": 2.6688068485821295, "grad_norm": 304.72540283203125, "kl": 9.5274658203125, "learning_rate": 1.8336428853940992e-07, "loss": 0.3811, "reward": 2.3846561908721924, "reward_std": 0.6542004942893982, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46278125047683716, "step": 4988 }, { "completion_length": 138.59375, "epoch": 2.669341894060995, "grad_norm": 1.5524959564208984, "kl": 0.2586050033569336, "learning_rate": 1.8277963512771113e-07, "loss": 0.0103, "reward": 2.1973750591278076, "reward_std": 0.5078793168067932, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.47862499952316284, "step": 4989 }, { "completion_length": 117.9375, "epoch": 2.669876939539861, "grad_norm": 1.5014097690582275, "kl": 0.181706964969635, "learning_rate": 1.8219587991503086e-07, "loss": 0.0073, "reward": 1.434593677520752, "reward_std": 0.41643571853637695, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48146873712539673, "step": 4990 }, { "completion_length": 128.53125, "epoch": 2.6704119850187267, "grad_norm": 1.0838240385055542, "kl": 0.16376636922359467, "learning_rate": 1.8161302312764167e-07, "loss": 0.0066, "reward": 2.0361874103546143, "reward_std": 1.010046124458313, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4424375295639038, "step": 4991 }, { "completion_length": 93.75, "epoch": 2.6709470304975924, "grad_norm": 0.4058070480823517, "kl": 0.16834120452404022, "learning_rate": 1.8103106499147122e-07, "loss": 0.0067, "reward": 2.375, "reward_std": 0.2314550280570984, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 4992 }, { "completion_length": 99.40625, "epoch": 2.671482075976458, "grad_norm": 0.7975855469703674, "kl": 0.20664247870445251, "learning_rate": 1.804500057320982e-07, "loss": 0.0083, "reward": 2.738156318664551, "reward_std": 0.572078287601471, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 4993 }, { "completion_length": 143.0, "epoch": 2.6720171214553234, "grad_norm": 0.5599237680435181, "kl": 0.163793683052063, "learning_rate": 1.7986984557475106e-07, "loss": 0.0066, "reward": 1.786718726158142, "reward_std": 0.7761600017547607, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4585937559604645, "step": 4994 }, { "completion_length": 129.46875, "epoch": 2.6725521669341896, "grad_norm": 1.6077357530593872, "kl": 0.15321896970272064, "learning_rate": 1.792905847443116e-07, "loss": 0.0061, "reward": 2.292250156402588, "reward_std": 0.42995601892471313, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.41724997758865356, "step": 4995 }, { "completion_length": 112.09375, "epoch": 2.673087212413055, "grad_norm": 0.7600963115692139, "kl": 0.20328471064567566, "learning_rate": 1.7871222346531269e-07, "loss": 0.0081, "reward": 2.2348124980926514, "reward_std": 0.3627322316169739, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45356249809265137, "step": 4996 }, { "completion_length": 122.8125, "epoch": 2.6736222578919207, "grad_norm": 0.9258863925933838, "kl": 0.16360363364219666, "learning_rate": 1.7813476196193723e-07, "loss": 0.0065, "reward": 2.325906276702881, "reward_std": 0.8407041430473328, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.43528127670288086, "step": 4997 }, { "completion_length": 125.5625, "epoch": 2.6741573033707864, "grad_norm": 1.6622915267944336, "kl": 0.2182941734790802, "learning_rate": 1.7755820045802146e-07, "loss": 0.0087, "reward": 2.2988438606262207, "reward_std": 1.0008387565612793, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47071874141693115, "step": 4998 }, { "completion_length": 138.09375, "epoch": 2.674692348849652, "grad_norm": 3.607865333557129, "kl": 0.3010059893131256, "learning_rate": 1.7698253917705134e-07, "loss": 0.012, "reward": 1.5999062061309814, "reward_std": 0.675437331199646, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4592812657356262, "step": 4999 }, { "completion_length": 112.15625, "epoch": 2.675227394328518, "grad_norm": 4826774.0, "kl": 7270.01025390625, "learning_rate": 1.7640777834216416e-07, "loss": 290.8003, "reward": 2.2134063243865967, "reward_std": 0.7107417583465576, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4790312647819519, "step": 5000 }, { "completion_length": 110.0, "epoch": 2.6757624398073836, "grad_norm": 1.2631055116653442, "kl": 0.19709396362304688, "learning_rate": 1.758339181761476e-07, "loss": 0.0079, "reward": 2.137812614440918, "reward_std": 0.47980743646621704, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4815624952316284, "step": 5001 }, { "completion_length": 135.6875, "epoch": 2.6762974852862493, "grad_norm": 0.7731746435165405, "kl": 0.14041538536548615, "learning_rate": 1.75260958901442e-07, "loss": 0.0056, "reward": 2.59765625, "reward_std": 0.7906268835067749, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 5002 }, { "completion_length": 126.9375, "epoch": 2.676832530765115, "grad_norm": 0.9277279376983643, "kl": 0.17055627703666687, "learning_rate": 1.7468890074013672e-07, "loss": 0.0068, "reward": 2.222156286239624, "reward_std": 0.44753384590148926, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47215625643730164, "step": 5003 }, { "completion_length": 134.46875, "epoch": 2.677367576243981, "grad_norm": 1.0588092803955078, "kl": 0.3333333134651184, "learning_rate": 1.7411774391397213e-07, "loss": 0.0133, "reward": 1.863968849182129, "reward_std": 0.26675480604171753, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47334372997283936, "step": 5004 }, { "completion_length": 131.46875, "epoch": 2.6779026217228465, "grad_norm": 0.9666696190834045, "kl": 0.1955007016658783, "learning_rate": 1.7354748864434057e-07, "loss": 0.0078, "reward": 2.2963125705718994, "reward_std": 0.5357930064201355, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49943751096725464, "step": 5005 }, { "completion_length": 115.90625, "epoch": 2.6784376672017123, "grad_norm": 1.1679962873458862, "kl": 0.45110276341438293, "learning_rate": 1.7297813515228328e-07, "loss": 0.018, "reward": 2.5191562175750732, "reward_std": 1.0760066509246826, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48790624737739563, "step": 5006 }, { "completion_length": 123.875, "epoch": 2.6789727126805776, "grad_norm": 0.6402842402458191, "kl": 0.18445920944213867, "learning_rate": 1.7240968365849208e-07, "loss": 0.0074, "reward": 2.20703125, "reward_std": 0.6482566595077515, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 5007 }, { "completion_length": 138.5, "epoch": 2.6795077581594438, "grad_norm": 0.4992249608039856, "kl": 0.15432162582874298, "learning_rate": 1.71842134383311e-07, "loss": 0.0062, "reward": 1.827625036239624, "reward_std": 0.48390674591064453, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49950000643730164, "step": 5008 }, { "completion_length": 156.40625, "epoch": 2.680042803638309, "grad_norm": 2.4162216186523438, "kl": 0.518997311592102, "learning_rate": 1.7127548754673152e-07, "loss": 0.0208, "reward": 1.9806874990463257, "reward_std": 1.115095853805542, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4181874990463257, "step": 5009 }, { "completion_length": 126.125, "epoch": 2.6805778491171752, "grad_norm": 1.1312490701675415, "kl": 0.15827825665473938, "learning_rate": 1.7070974336839797e-07, "loss": 0.0063, "reward": 2.326937675476074, "reward_std": 1.0896090269088745, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4363124966621399, "step": 5010 }, { "completion_length": 114.53125, "epoch": 2.6811128945960405, "grad_norm": 1.236908197402954, "kl": 0.18822850286960602, "learning_rate": 1.7014490206760298e-07, "loss": 0.0075, "reward": 2.1850311756134033, "reward_std": 0.8098620176315308, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4506562352180481, "step": 5011 }, { "completion_length": 120.09375, "epoch": 2.6816479400749063, "grad_norm": 1.280128002166748, "kl": 0.21597477793693542, "learning_rate": 1.6958096386328977e-07, "loss": 0.0086, "reward": 2.7139687538146973, "reward_std": 0.34081795811653137, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44834375381469727, "step": 5012 }, { "completion_length": 108.59375, "epoch": 2.682182985553772, "grad_norm": 1.3085522651672363, "kl": 0.22593954205513, "learning_rate": 1.6901792897405233e-07, "loss": 0.009, "reward": 2.9097812175750732, "reward_std": 0.9304903745651245, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48790624737739563, "step": 5013 }, { "completion_length": 122.40625, "epoch": 2.6827180310326377, "grad_norm": 1.9504998922348022, "kl": 0.25457555055618286, "learning_rate": 1.6845579761813335e-07, "loss": 0.0102, "reward": 1.9925312995910645, "reward_std": 0.7594448924064636, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4456562399864197, "step": 5014 }, { "completion_length": 146.3125, "epoch": 2.6832530765115035, "grad_norm": 1.3520983457565308, "kl": 0.25375092029571533, "learning_rate": 1.6789457001342575e-07, "loss": 0.0101, "reward": 1.6376874446868896, "reward_std": 1.0789921283721924, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.4033125042915344, "step": 5015 }, { "completion_length": 127.125, "epoch": 2.683788121990369, "grad_norm": 1.493973970413208, "kl": 0.2592187523841858, "learning_rate": 1.673342463774713e-07, "loss": 0.0104, "reward": 2.8395938873291016, "reward_std": 0.491386353969574, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.480218768119812, "step": 5016 }, { "completion_length": 120.875, "epoch": 2.684323167469235, "grad_norm": 1.9911155700683594, "kl": 0.15614736080169678, "learning_rate": 1.6677482692746382e-07, "loss": 0.0062, "reward": 2.09375, "reward_std": 0.995941162109375, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.453125, "step": 5017 }, { "completion_length": 134.28125, "epoch": 2.6848582129481007, "grad_norm": 1.6156806945800781, "kl": 0.23526296019554138, "learning_rate": 1.6621631188024373e-07, "loss": 0.0094, "reward": 2.0545310974121094, "reward_std": 1.0538990497589111, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4451562464237213, "step": 5018 }, { "completion_length": 162.75, "epoch": 2.6853932584269664, "grad_norm": 11.122580528259277, "kl": 0.48419544100761414, "learning_rate": 1.656587014523023e-07, "loss": 0.0194, "reward": 1.270124912261963, "reward_std": 0.7402723431587219, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.39512500166893005, "step": 5019 }, { "completion_length": 153.625, "epoch": 2.685928303905832, "grad_norm": 1.1791099309921265, "kl": 0.26236557960510254, "learning_rate": 1.6510199585978077e-07, "loss": 0.0105, "reward": 1.6475000381469727, "reward_std": 0.8485966324806213, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3974999785423279, "step": 5020 }, { "completion_length": 122.1875, "epoch": 2.686463349384698, "grad_norm": 1.480478286743164, "kl": 0.19586659967899323, "learning_rate": 1.645461953184682e-07, "loss": 0.0078, "reward": 2.5166561603546143, "reward_std": 0.5823056697845459, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4697812497615814, "step": 5021 }, { "completion_length": 134.78125, "epoch": 2.686998394863563, "grad_norm": 1.4650344848632812, "kl": 0.31778374314308167, "learning_rate": 1.6399130004380341e-07, "loss": 0.0127, "reward": 1.9409375190734863, "reward_std": 1.2072422504425049, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45656248927116394, "step": 5022 }, { "completion_length": 132.59375, "epoch": 2.6875334403424294, "grad_norm": 1.0487470626831055, "kl": 0.28542083501815796, "learning_rate": 1.6343731025087517e-07, "loss": 0.0114, "reward": 1.961343765258789, "reward_std": 0.8054364919662476, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44571876525878906, "step": 5023 }, { "completion_length": 120.9375, "epoch": 2.6880684858212947, "grad_norm": 0.9502213597297668, "kl": 0.23330830037593842, "learning_rate": 1.6288422615442e-07, "loss": 0.0093, "reward": 2.229062557220459, "reward_std": 0.6566912531852722, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4790624976158142, "step": 5024 }, { "completion_length": 119.34375, "epoch": 2.6886035313001604, "grad_norm": 0.9848747253417969, "kl": 0.1744576394557953, "learning_rate": 1.6233204796882367e-07, "loss": 0.007, "reward": 2.951218605041504, "reward_std": 0.8394352197647095, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48246875405311584, "step": 5025 }, { "completion_length": 125.09375, "epoch": 2.689138576779026, "grad_norm": 0.7404921054840088, "kl": 0.20695273578166962, "learning_rate": 1.617807759081219e-07, "loss": 0.0083, "reward": 1.9573750495910645, "reward_std": 0.6154044270515442, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4573749899864197, "step": 5026 }, { "completion_length": 136.5, "epoch": 2.689673622257892, "grad_norm": 2.8093924522399902, "kl": 0.18858444690704346, "learning_rate": 1.6123041018599766e-07, "loss": 0.0075, "reward": 1.3334062099456787, "reward_std": 0.7363686561584473, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4271562695503235, "step": 5027 }, { "completion_length": 131.5, "epoch": 2.6902086677367576, "grad_norm": 0.6410025954246521, "kl": 0.19898326694965363, "learning_rate": 1.6068095101578334e-07, "loss": 0.008, "reward": 2.100749969482422, "reward_std": 1.1544668674468994, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46012499928474426, "step": 5028 }, { "completion_length": 157.25, "epoch": 2.6907437132156233, "grad_norm": 0.6289058923721313, "kl": 0.14169806241989136, "learning_rate": 1.601323986104597e-07, "loss": 0.0057, "reward": 1.7109375, "reward_std": 0.9203346371650696, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4296875, "step": 5029 }, { "completion_length": 138.3125, "epoch": 2.691278758694489, "grad_norm": 5.243817329406738, "kl": 0.5771572589874268, "learning_rate": 1.5958475318265637e-07, "loss": 0.0231, "reward": 1.2843749523162842, "reward_std": 0.5154773592948914, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45625001192092896, "step": 5030 }, { "completion_length": 124.90625, "epoch": 2.691813804173355, "grad_norm": 1.5594559907913208, "kl": 0.1657978594303131, "learning_rate": 1.5903801494465132e-07, "loss": 0.0066, "reward": 1.8093750476837158, "reward_std": 0.5632444620132446, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41874998807907104, "step": 5031 }, { "completion_length": 131.9375, "epoch": 2.6923488496522205, "grad_norm": 1.843940258026123, "kl": 0.16973859071731567, "learning_rate": 1.5849218410837031e-07, "loss": 0.0068, "reward": 2.4032812118530273, "reward_std": 0.8392753601074219, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48140624165534973, "step": 5032 }, { "completion_length": 124.03125, "epoch": 2.6928838951310863, "grad_norm": 0.9626078009605408, "kl": 0.29644066095352173, "learning_rate": 1.5794726088538908e-07, "loss": 0.0119, "reward": 1.8426250219345093, "reward_std": 0.9271305799484253, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4207499921321869, "step": 5033 }, { "completion_length": 119.375, "epoch": 2.693418940609952, "grad_norm": 0.5617833733558655, "kl": 0.2144608497619629, "learning_rate": 1.5740324548692842e-07, "loss": 0.0086, "reward": 2.3040623664855957, "reward_std": 0.5541949272155762, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49156248569488525, "step": 5034 }, { "completion_length": 135.1875, "epoch": 2.6939539860888173, "grad_norm": 0.7908862233161926, "kl": 0.17835527658462524, "learning_rate": 1.5686013812386019e-07, "loss": 0.0071, "reward": 1.8917187452316284, "reward_std": 0.5610748529434204, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4542187452316284, "step": 5035 }, { "completion_length": 132.09375, "epoch": 2.6944890315676835, "grad_norm": 0.8458577990531921, "kl": 0.228354811668396, "learning_rate": 1.563179390067038e-07, "loss": 0.0091, "reward": 2.4269375801086426, "reward_std": 0.8026630878448486, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4113124907016754, "step": 5036 }, { "completion_length": 122.9375, "epoch": 2.695024077046549, "grad_norm": 2.725996255874634, "kl": 0.2231525480747223, "learning_rate": 1.5577664834562438e-07, "loss": 0.0089, "reward": 3.028249979019165, "reward_std": 0.7218142151832581, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4813750088214874, "step": 5037 }, { "completion_length": 122.8125, "epoch": 2.6955591225254145, "grad_norm": 1.3712255954742432, "kl": 0.1690860390663147, "learning_rate": 1.5523626635043808e-07, "loss": 0.0068, "reward": 2.00962495803833, "reward_std": 0.8731968402862549, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47837498784065247, "step": 5038 }, { "completion_length": 140.71875, "epoch": 2.6960941680042803, "grad_norm": 0.6630473732948303, "kl": 0.19963693618774414, "learning_rate": 1.5469679323060677e-07, "loss": 0.008, "reward": 2.2196874618530273, "reward_std": 0.9952989816665649, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4696875214576721, "step": 5039 }, { "completion_length": 149.5, "epoch": 2.696629213483146, "grad_norm": 0.6553966999053955, "kl": 0.21363908052444458, "learning_rate": 1.541582291952401e-07, "loss": 0.0085, "reward": 1.3671875, "reward_std": 0.3689081370830536, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 5040 }, { "completion_length": 121.28125, "epoch": 2.6971642589620117, "grad_norm": 0.8329000473022461, "kl": 0.16144071519374847, "learning_rate": 1.5362057445309665e-07, "loss": 0.0065, "reward": 2.456906318664551, "reward_std": 0.5889270901679993, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 5041 }, { "completion_length": 158.25, "epoch": 2.6976993044408775, "grad_norm": 1.9120734930038452, "kl": 0.23875144124031067, "learning_rate": 1.5308382921258135e-07, "loss": 0.0096, "reward": 1.5027812719345093, "reward_std": 0.7670342922210693, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3934062421321869, "step": 5042 }, { "completion_length": 111.03125, "epoch": 2.698234349919743, "grad_norm": 0.7257035374641418, "kl": 0.2685118317604065, "learning_rate": 1.5254799368174672e-07, "loss": 0.0107, "reward": 2.665656328201294, "reward_std": 0.8033747673034668, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47815626859664917, "step": 5043 }, { "completion_length": 143.59375, "epoch": 2.698769395398609, "grad_norm": 2.1498141288757324, "kl": 0.15785089135169983, "learning_rate": 1.5201306806829297e-07, "loss": 0.0063, "reward": 2.037468910217285, "reward_std": 0.9074698686599731, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4593437612056732, "step": 5044 }, { "completion_length": 132.3125, "epoch": 2.6993044408774747, "grad_norm": 1.3129030466079712, "kl": 0.1361820101737976, "learning_rate": 1.5147905257956758e-07, "loss": 0.0054, "reward": 1.5546562671661377, "reward_std": 0.6429727077484131, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4452812373638153, "step": 5045 }, { "completion_length": 120.71875, "epoch": 2.6998394863563404, "grad_norm": 1.2274463176727295, "kl": 0.20686094462871552, "learning_rate": 1.5094594742256552e-07, "loss": 0.0083, "reward": 2.516906261444092, "reward_std": 0.7049520015716553, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4856562614440918, "step": 5046 }, { "completion_length": 109.21875, "epoch": 2.700374531835206, "grad_norm": 1.0588221549987793, "kl": 0.2224940061569214, "learning_rate": 1.504137528039279e-07, "loss": 0.0089, "reward": 2.297968864440918, "reward_std": 0.5913151502609253, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4854687452316284, "step": 5047 }, { "completion_length": 133.6875, "epoch": 2.7009095773140714, "grad_norm": 1.0555038452148438, "kl": 0.26381078362464905, "learning_rate": 1.4988246892994413e-07, "loss": 0.0106, "reward": 1.5765937566757202, "reward_std": 0.8061937093734741, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4359687566757202, "step": 5048 }, { "completion_length": 134.5625, "epoch": 2.7014446227929376, "grad_norm": 2.637281894683838, "kl": 0.25262099504470825, "learning_rate": 1.4935209600654977e-07, "loss": 0.0101, "reward": 2.335343837738037, "reward_std": 1.0474562644958496, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.44471874833106995, "step": 5049 }, { "completion_length": 123.0, "epoch": 2.701979668271803, "grad_norm": 0.7987422347068787, "kl": 0.16864079236984253, "learning_rate": 1.488226342393273e-07, "loss": 0.0067, "reward": 3.036156177520752, "reward_std": 0.7172688245773315, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4736562669277191, "step": 5050 }, { "completion_length": 136.46875, "epoch": 2.7025147137506687, "grad_norm": 1.185326099395752, "kl": 0.2102138102054596, "learning_rate": 1.4829408383350673e-07, "loss": 0.0084, "reward": 1.7086563110351562, "reward_std": 0.6901732087135315, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4586562514305115, "step": 5051 }, { "completion_length": 147.59375, "epoch": 2.7030497592295344, "grad_norm": 9.196147918701172, "kl": 0.4903072118759155, "learning_rate": 1.477664449939642e-07, "loss": 0.0196, "reward": 1.874593734741211, "reward_std": 0.7023598551750183, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4683437645435333, "step": 5052 }, { "completion_length": 130.0625, "epoch": 2.7035848047084, "grad_norm": 6281364.0, "kl": 25568.736328125, "learning_rate": 1.4723971792522274e-07, "loss": 1022.7494, "reward": 2.0935938358306885, "reward_std": 0.6784015893936157, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4529687464237213, "step": 5053 }, { "completion_length": 115.5625, "epoch": 2.704119850187266, "grad_norm": 1.8406206369400024, "kl": 0.2365315705537796, "learning_rate": 1.4671390283145097e-07, "loss": 0.0095, "reward": 1.5489063262939453, "reward_std": 0.6655687093734741, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45515626668930054, "step": 5054 }, { "completion_length": 149.75, "epoch": 2.7046548956661316, "grad_norm": 0.5577764511108398, "kl": 0.1433200240135193, "learning_rate": 1.461889999164659e-07, "loss": 0.0057, "reward": 1.6396561861038208, "reward_std": 0.6022042036056519, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4365312457084656, "step": 5055 }, { "completion_length": 137.875, "epoch": 2.7051899411449973, "grad_norm": 1.22615385055542, "kl": 0.24244792759418488, "learning_rate": 1.4566500938373002e-07, "loss": 0.0097, "reward": 1.8843125104904175, "reward_std": 0.763673722743988, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4936875104904175, "step": 5056 }, { "completion_length": 140.25, "epoch": 2.705724986623863, "grad_norm": 2.0120954513549805, "kl": 0.17481642961502075, "learning_rate": 1.4514193143635085e-07, "loss": 0.007, "reward": 1.937749981880188, "reward_std": 0.9403116703033447, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.437749981880188, "step": 5057 }, { "completion_length": 154.90625, "epoch": 2.706260032102729, "grad_norm": 1.494793176651001, "kl": 0.3013884127140045, "learning_rate": 1.4461976627708513e-07, "loss": 0.0121, "reward": 1.6303436756134033, "reward_std": 0.9865832328796387, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4115937352180481, "step": 5058 }, { "completion_length": 121.34375, "epoch": 2.7067950775815945, "grad_norm": 2.659083127975464, "kl": 0.16466784477233887, "learning_rate": 1.4409851410833288e-07, "loss": 0.0066, "reward": 1.77734375, "reward_std": 0.7130647897720337, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46484375, "step": 5059 }, { "completion_length": 113.3125, "epoch": 2.7073301230604603, "grad_norm": 7.4303083419799805, "kl": 1.5384689569473267, "learning_rate": 1.4357817513214163e-07, "loss": 0.0615, "reward": 2.3433752059936523, "reward_std": 0.8780409097671509, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49962499737739563, "step": 5060 }, { "completion_length": 135.625, "epoch": 2.7078651685393256, "grad_norm": 0.7564041614532471, "kl": 0.2007635235786438, "learning_rate": 1.430587495502056e-07, "loss": 0.008, "reward": 1.628499984741211, "reward_std": 0.7681260108947754, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.42537498474121094, "step": 5061 }, { "completion_length": 126.0, "epoch": 2.7084002140181918, "grad_norm": 1.7736707925796509, "kl": 0.20249944925308228, "learning_rate": 1.4254023756386265e-07, "loss": 0.0081, "reward": 2.5445001125335693, "reward_std": 0.6309590339660645, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4820000231266022, "step": 5062 }, { "completion_length": 134.8125, "epoch": 2.708935259497057, "grad_norm": 1.6248228549957275, "kl": 0.18000847101211548, "learning_rate": 1.4202263937409894e-07, "loss": 0.0072, "reward": 1.9379687309265137, "reward_std": 0.9840935468673706, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43796876072883606, "step": 5063 }, { "completion_length": 133.3125, "epoch": 2.7094703049759232, "grad_norm": 0.4990469515323639, "kl": 0.16232949495315552, "learning_rate": 1.4150595518154536e-07, "loss": 0.0065, "reward": 2.103156328201294, "reward_std": 0.5587530732154846, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44690626859664917, "step": 5064 }, { "completion_length": 151.0625, "epoch": 2.7100053504547885, "grad_norm": 1.2863023281097412, "kl": 0.1775388866662979, "learning_rate": 1.4099018518647812e-07, "loss": 0.0071, "reward": 1.7860312461853027, "reward_std": 0.722564697265625, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45790624618530273, "step": 5065 }, { "completion_length": 135.875, "epoch": 2.7105403959336543, "grad_norm": 0.6119493842124939, "kl": 0.1562640517950058, "learning_rate": 1.4047532958882032e-07, "loss": 0.0063, "reward": 2.3203125, "reward_std": 0.7043894529342651, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 5066 }, { "completion_length": 158.9375, "epoch": 2.71107544141252, "grad_norm": 0.4296797811985016, "kl": 0.1386576145887375, "learning_rate": 1.3996138858813928e-07, "loss": 0.0055, "reward": 1.2080625295639038, "reward_std": 0.333161860704422, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4268124997615814, "step": 5067 }, { "completion_length": 122.34375, "epoch": 2.7116104868913857, "grad_norm": 9.988106727600098, "kl": 0.2969169020652771, "learning_rate": 1.3944836238364818e-07, "loss": 0.0119, "reward": 2.250593662261963, "reward_std": 0.6647583246231079, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48496875166893005, "step": 5068 }, { "completion_length": 135.78125, "epoch": 2.7121455323702515, "grad_norm": 1.1513041257858276, "kl": 0.21870939433574677, "learning_rate": 1.389362511742065e-07, "loss": 0.0087, "reward": 2.011312484741211, "reward_std": 0.8050680160522461, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4331875145435333, "step": 5069 }, { "completion_length": 139.90625, "epoch": 2.712680577849117, "grad_norm": 1.6332933902740479, "kl": 0.16911010444164276, "learning_rate": 1.38425055158318e-07, "loss": 0.0068, "reward": 1.9151562452316284, "reward_std": 0.9858714938163757, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4464062452316284, "step": 5070 }, { "completion_length": 117.3125, "epoch": 2.713215623327983, "grad_norm": 3.1900711059570312, "kl": 0.26005569100379944, "learning_rate": 1.379147745341322e-07, "loss": 0.0104, "reward": 1.6023125648498535, "reward_std": 0.7441666722297668, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46168750524520874, "step": 5071 }, { "completion_length": 113.8125, "epoch": 2.7137506688068487, "grad_norm": 0.7484967708587646, "kl": 0.23042050004005432, "learning_rate": 1.3740540949944314e-07, "loss": 0.0092, "reward": 2.289468765258789, "reward_std": 1.0245481729507446, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49259376525878906, "step": 5072 }, { "completion_length": 145.9375, "epoch": 2.7142857142857144, "grad_norm": 7.272564888000488, "kl": 0.5828892588615417, "learning_rate": 1.3689696025169118e-07, "loss": 0.0233, "reward": 1.2341562509536743, "reward_std": 0.809553861618042, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3904062509536743, "step": 5073 }, { "completion_length": 114.625, "epoch": 2.71482075976458, "grad_norm": 0.5731979012489319, "kl": 0.188478022813797, "learning_rate": 1.3638942698796065e-07, "loss": 0.0075, "reward": 2.6438751220703125, "reward_std": 0.527918815612793, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48762500286102295, "step": 5074 }, { "completion_length": 151.1875, "epoch": 2.715355805243446, "grad_norm": 0.7680234909057617, "kl": 0.1840032935142517, "learning_rate": 1.3588280990498082e-07, "loss": 0.0074, "reward": 2.221343755722046, "reward_std": 1.153943657875061, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4557187557220459, "step": 5075 }, { "completion_length": 111.59375, "epoch": 2.715890850722311, "grad_norm": 0.5993170738220215, "kl": 0.17482060194015503, "learning_rate": 1.353771091991271e-07, "loss": 0.007, "reward": 1.8488438129425049, "reward_std": 1.0176856517791748, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4582187533378601, "step": 5076 }, { "completion_length": 111.65625, "epoch": 2.7164258962011774, "grad_norm": 0.34205037355422974, "kl": 0.1251520812511444, "learning_rate": 1.3487232506641774e-07, "loss": 0.005, "reward": 2.6323750019073486, "reward_std": 0.6808791160583496, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49175000190734863, "step": 5077 }, { "completion_length": 136.15625, "epoch": 2.7169609416800427, "grad_norm": 1.0413800477981567, "kl": 0.19366802275180817, "learning_rate": 1.34368457702517e-07, "loss": 0.0077, "reward": 1.625, "reward_std": 0.819821834564209, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.453125, "step": 5078 }, { "completion_length": 124.0, "epoch": 2.7174959871589084, "grad_norm": 1.3021832704544067, "kl": 0.18247029185295105, "learning_rate": 1.338655073027345e-07, "loss": 0.0073, "reward": 1.906406283378601, "reward_std": 0.39143308997154236, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4689062535762787, "step": 5079 }, { "completion_length": 119.6875, "epoch": 2.718031032637774, "grad_norm": 1.1282621622085571, "kl": 0.19204412400722504, "learning_rate": 1.333634740620221e-07, "loss": 0.0077, "reward": 1.7770311832427979, "reward_std": 0.6338779330253601, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4801562428474426, "step": 5080 }, { "completion_length": 101.15625, "epoch": 2.71856607811664, "grad_norm": 1.4358744621276855, "kl": 0.18667137622833252, "learning_rate": 1.3286235817497828e-07, "loss": 0.0075, "reward": 2.581031322479248, "reward_std": 0.6750540733337402, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48728126287460327, "step": 5081 }, { "completion_length": 138.03125, "epoch": 2.7191011235955056, "grad_norm": 5.570402145385742, "kl": 0.19712993502616882, "learning_rate": 1.3236215983584515e-07, "loss": 0.0079, "reward": 1.9344062805175781, "reward_std": 0.8144604563713074, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46565625071525574, "step": 5082 }, { "completion_length": 131.96875, "epoch": 2.7196361690743713, "grad_norm": 1.1348153352737427, "kl": 0.18417581915855408, "learning_rate": 1.318628792385085e-07, "loss": 0.0074, "reward": 2.6838436126708984, "reward_std": 0.7001142501831055, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.465093731880188, "step": 5083 }, { "completion_length": 136.9375, "epoch": 2.720171214553237, "grad_norm": 0.9611102938652039, "kl": 0.16273772716522217, "learning_rate": 1.3136451657650013e-07, "loss": 0.0065, "reward": 2.25390625, "reward_std": 0.7262357473373413, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 5084 }, { "completion_length": 126.5, "epoch": 2.720706260032103, "grad_norm": 1.7737606763839722, "kl": 0.1679208129644394, "learning_rate": 1.3086707204299415e-07, "loss": 0.0067, "reward": 1.7874687910079956, "reward_std": 0.9468878507614136, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4593437612056732, "step": 5085 }, { "completion_length": 131.53125, "epoch": 2.7212413055109685, "grad_norm": 1.034417986869812, "kl": 0.16180886328220367, "learning_rate": 1.3037054583080992e-07, "loss": 0.0065, "reward": 1.9296875, "reward_std": 0.6434518694877625, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4921875, "step": 5086 }, { "completion_length": 125.9375, "epoch": 2.7217763509898343, "grad_norm": 1.5418893098831177, "kl": 0.18639133870601654, "learning_rate": 1.2987493813240991e-07, "loss": 0.0075, "reward": 1.5347187519073486, "reward_std": 0.4944457709789276, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.47221875190734863, "step": 5087 }, { "completion_length": 137.5, "epoch": 2.7223113964687, "grad_norm": 6.03188943862915, "kl": 0.14424996078014374, "learning_rate": 1.293802491399021e-07, "loss": 0.0058, "reward": 1.897031307220459, "reward_std": 0.986897349357605, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4282812476158142, "step": 5088 }, { "completion_length": 149.25, "epoch": 2.7228464419475653, "grad_norm": 0.8546514511108398, "kl": 0.12964677810668945, "learning_rate": 1.2888647904503727e-07, "loss": 0.0052, "reward": 1.7591562271118164, "reward_std": 0.844726026058197, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4466562569141388, "step": 5089 }, { "completion_length": 119.75, "epoch": 2.7233814874264315, "grad_norm": 0.9641762971878052, "kl": 0.17527088522911072, "learning_rate": 1.2839362803920924e-07, "loss": 0.007, "reward": 1.8788437843322754, "reward_std": 0.8010373115539551, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.488218754529953, "step": 5090 }, { "completion_length": 148.8125, "epoch": 2.723916532905297, "grad_norm": 0.5928105711936951, "kl": 0.1497424840927124, "learning_rate": 1.2790169631345744e-07, "loss": 0.006, "reward": 1.5182499885559082, "reward_std": 0.7312904000282288, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4401249885559082, "step": 5091 }, { "completion_length": 124.96875, "epoch": 2.7244515783841625, "grad_norm": 2.0963170528411865, "kl": 0.22492791712284088, "learning_rate": 1.2741068405846406e-07, "loss": 0.009, "reward": 2.5592188835144043, "reward_std": 0.7173202037811279, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44984376430511475, "step": 5092 }, { "completion_length": 145.34375, "epoch": 2.7249866238630283, "grad_norm": 7114566.5, "kl": 370232.25, "learning_rate": 1.269205914645541e-07, "loss": 14809.291, "reward": 1.6897187232971191, "reward_std": 0.5044546127319336, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45534375309944153, "step": 5093 }, { "completion_length": 125.71875, "epoch": 2.725521669341894, "grad_norm": 0.6662471890449524, "kl": 0.2178836315870285, "learning_rate": 1.2643141872169752e-07, "loss": 0.0087, "reward": 1.372687578201294, "reward_std": 0.42625752091407776, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45081251859664917, "step": 5094 }, { "completion_length": 112.96875, "epoch": 2.7260567148207597, "grad_norm": 0.5687112808227539, "kl": 0.17384377121925354, "learning_rate": 1.259431660195068e-07, "loss": 0.007, "reward": 2.629499912261963, "reward_std": 0.40864041447639465, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47325000166893005, "step": 5095 }, { "completion_length": 123.34375, "epoch": 2.7265917602996255, "grad_norm": 1.5150060653686523, "kl": 0.1746421456336975, "learning_rate": 1.2545583354723777e-07, "loss": 0.007, "reward": 2.234250068664551, "reward_std": 0.6324112415313721, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 5096 }, { "completion_length": 115.84375, "epoch": 2.727126805778491, "grad_norm": 0.7303487062454224, "kl": 0.18607740104198456, "learning_rate": 1.2496942149379049e-07, "loss": 0.0074, "reward": 2.826437473297119, "reward_std": 0.5339358448982239, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46706250309944153, "step": 5097 }, { "completion_length": 115.3125, "epoch": 2.727661851257357, "grad_norm": 9.070145606994629, "kl": 0.24130114912986755, "learning_rate": 1.244839300477074e-07, "loss": 0.0097, "reward": 2.09375, "reward_std": 0.5746838450431824, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 5098 }, { "completion_length": 117.84375, "epoch": 2.7281968967362227, "grad_norm": 1.514715313911438, "kl": 0.36641740798950195, "learning_rate": 1.2399935939717388e-07, "loss": 0.0147, "reward": 2.64453125, "reward_std": 0.7693533897399902, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 5099 }, { "completion_length": 129.34375, "epoch": 2.7287319422150884, "grad_norm": 5.8329854011535645, "kl": 0.7321270704269409, "learning_rate": 1.235157097300188e-07, "loss": 0.0293, "reward": 2.36328125, "reward_std": 0.8734537363052368, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48828125, "step": 5100 }, { "completion_length": 129.125, "epoch": 2.729266987693954, "grad_norm": 0.8580272793769836, "kl": 0.16873818635940552, "learning_rate": 1.2303298123371444e-07, "loss": 0.0067, "reward": 1.6688125133514404, "reward_std": 0.5756450891494751, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46568751335144043, "step": 5101 }, { "completion_length": 122.1875, "epoch": 2.7298020331728194, "grad_norm": 1.7020182609558105, "kl": 0.20246277749538422, "learning_rate": 1.2255117409537582e-07, "loss": 0.0081, "reward": 2.0977187156677246, "reward_std": 1.0676662921905518, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.441468745470047, "step": 5102 }, { "completion_length": 157.5625, "epoch": 2.7303370786516856, "grad_norm": 2.1320672035217285, "kl": 0.4315379858016968, "learning_rate": 1.2207028850175968e-07, "loss": 0.0173, "reward": 1.2214062213897705, "reward_std": 0.5868953466415405, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4557812511920929, "step": 5103 }, { "completion_length": 100.71875, "epoch": 2.730872124130551, "grad_norm": 1.1884959936141968, "kl": 0.2390529364347458, "learning_rate": 1.2159032463926773e-07, "loss": 0.0096, "reward": 1.8206250667572021, "reward_std": 0.6420402526855469, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4768750071525574, "step": 5104 }, { "completion_length": 137.28125, "epoch": 2.7314071696094167, "grad_norm": 0.7017916440963745, "kl": 0.14266934990882874, "learning_rate": 1.2111128269394174e-07, "loss": 0.0057, "reward": 2.085812568664551, "reward_std": 0.824424147605896, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.445187509059906, "step": 5105 }, { "completion_length": 139.1875, "epoch": 2.7319422150882824, "grad_norm": 2.32414174079895, "kl": 0.17247027158737183, "learning_rate": 1.2063316285146843e-07, "loss": 0.0069, "reward": 1.9284999370574951, "reward_std": 0.9776197671890259, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4597499966621399, "step": 5106 }, { "completion_length": 111.5625, "epoch": 2.732477260567148, "grad_norm": 0.8776276111602783, "kl": 0.18518024682998657, "learning_rate": 1.2015596529717676e-07, "loss": 0.0074, "reward": 1.913468837738037, "reward_std": 0.2023315280675888, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47596877813339233, "step": 5107 }, { "completion_length": 135.84375, "epoch": 2.733012306046014, "grad_norm": 1.0037853717803955, "kl": 0.2727399170398712, "learning_rate": 1.1967969021603632e-07, "loss": 0.0109, "reward": 2.06640625, "reward_std": 0.9399433732032776, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 5108 }, { "completion_length": 148.71875, "epoch": 2.7335473515248796, "grad_norm": 1.9205999374389648, "kl": 0.19397509098052979, "learning_rate": 1.192043377926616e-07, "loss": 0.0078, "reward": 2.2259998321533203, "reward_std": 0.9267634749412537, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46037501096725464, "step": 5109 }, { "completion_length": 140.75, "epoch": 2.7340823970037453, "grad_norm": 1.1800379753112793, "kl": 0.26808691024780273, "learning_rate": 1.1872990821130747e-07, "loss": 0.0107, "reward": 1.9144062995910645, "reward_std": 0.7189375162124634, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4612812399864197, "step": 5110 }, { "completion_length": 138.625, "epoch": 2.734617442482611, "grad_norm": 1.0370656251907349, "kl": 0.14943364262580872, "learning_rate": 1.182564016558721e-07, "loss": 0.006, "reward": 1.791812539100647, "reward_std": 0.6782114505767822, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4793125092983246, "step": 5111 }, { "completion_length": 142.9375, "epoch": 2.735152487961477, "grad_norm": 1.3805545568466187, "kl": 0.2142218053340912, "learning_rate": 1.1778381830989615e-07, "loss": 0.0086, "reward": 2.1971874237060547, "reward_std": 0.7637585401535034, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44718748331069946, "step": 5112 }, { "completion_length": 115.9375, "epoch": 2.7356875334403425, "grad_norm": 0.45865917205810547, "kl": 0.1557600498199463, "learning_rate": 1.1731215835656202e-07, "loss": 0.0062, "reward": 2.7268123626708984, "reward_std": 0.33284538984298706, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.492437481880188, "step": 5113 }, { "completion_length": 119.375, "epoch": 2.7362225789192083, "grad_norm": 0.7443240284919739, "kl": 0.22539237141609192, "learning_rate": 1.1684142197869342e-07, "loss": 0.009, "reward": 2.223374843597412, "reward_std": 0.8828926086425781, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45775002241134644, "step": 5114 }, { "completion_length": 147.5625, "epoch": 2.7367576243980736, "grad_norm": 1.48055100440979, "kl": 0.16601240634918213, "learning_rate": 1.1637160935875718e-07, "loss": 0.0066, "reward": 2.1874375343322754, "reward_std": 0.7485086917877197, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.453062504529953, "step": 5115 }, { "completion_length": 139.09375, "epoch": 2.7372926698769398, "grad_norm": 3.1331071853637695, "kl": 0.18582819402217865, "learning_rate": 1.1590272067886183e-07, "loss": 0.0074, "reward": 2.0325000286102295, "reward_std": 0.670566737651825, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4699999988079071, "step": 5116 }, { "completion_length": 126.40625, "epoch": 2.737827715355805, "grad_norm": 1.237234115600586, "kl": 0.2175261676311493, "learning_rate": 1.1543475612075749e-07, "loss": 0.0087, "reward": 2.178999900817871, "reward_std": 0.7960532307624817, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.47587502002716064, "step": 5117 }, { "completion_length": 113.875, "epoch": 2.738362760834671, "grad_norm": 0.7222383618354797, "kl": 0.22226229310035706, "learning_rate": 1.1496771586583605e-07, "loss": 0.0089, "reward": 2.7022500038146973, "reward_std": 0.6312946081161499, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49912500381469727, "step": 5118 }, { "completion_length": 130.3125, "epoch": 2.7388978063135365, "grad_norm": 1.1928249597549438, "kl": 0.2101941704750061, "learning_rate": 1.1450160009513156e-07, "loss": 0.0084, "reward": 1.6258437633514404, "reward_std": 0.9510914087295532, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.43834373354911804, "step": 5119 }, { "completion_length": 139.84375, "epoch": 2.7394328517924023, "grad_norm": 2.2743496894836426, "kl": 0.1996987760066986, "learning_rate": 1.1403640898931922e-07, "loss": 0.008, "reward": 1.6784374713897705, "reward_std": 0.7928838729858398, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3971874713897705, "step": 5120 }, { "completion_length": 106.1875, "epoch": 2.739967897271268, "grad_norm": 3.127642869949341, "kl": 0.19564011693000793, "learning_rate": 1.1357214272871592e-07, "loss": 0.0078, "reward": 2.373812437057495, "reward_std": 0.5723759531974792, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4675624966621399, "step": 5121 }, { "completion_length": 129.125, "epoch": 2.7405029427501337, "grad_norm": 1.379489779472351, "kl": 0.16050070524215698, "learning_rate": 1.1310880149328074e-07, "loss": 0.0064, "reward": 1.918968677520752, "reward_std": 0.9437030553817749, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46584373712539673, "step": 5122 }, { "completion_length": 123.78125, "epoch": 2.7410379882289995, "grad_norm": 0.7456056475639343, "kl": 0.17704683542251587, "learning_rate": 1.1264638546261253e-07, "loss": 0.0071, "reward": 2.3172812461853027, "reward_std": 0.45782971382141113, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45790624618530273, "step": 5123 }, { "completion_length": 119.90625, "epoch": 2.741573033707865, "grad_norm": 1.9461170434951782, "kl": 0.23655080795288086, "learning_rate": 1.1218489481595374e-07, "loss": 0.0095, "reward": 2.3496251106262207, "reward_std": 0.7847775220870972, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47462499141693115, "step": 5124 }, { "completion_length": 122.6875, "epoch": 2.742108079186731, "grad_norm": 0.6383616328239441, "kl": 0.20616544783115387, "learning_rate": 1.1172432973218628e-07, "loss": 0.0082, "reward": 2.405562400817871, "reward_std": 1.096566915512085, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46806252002716064, "step": 5125 }, { "completion_length": 129.71875, "epoch": 2.7426431246655967, "grad_norm": 1.201243281364441, "kl": 0.2358737289905548, "learning_rate": 1.1126469038983401e-07, "loss": 0.0094, "reward": 1.8123750686645508, "reward_std": 0.8943297266960144, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 5126 }, { "completion_length": 126.125, "epoch": 2.7431781701444624, "grad_norm": 1.1604666709899902, "kl": 0.18383030593395233, "learning_rate": 1.1080597696706246e-07, "loss": 0.0074, "reward": 1.7741562128067017, "reward_std": 0.5293416976928711, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.49290624260902405, "step": 5127 }, { "completion_length": 123.5, "epoch": 2.7437132156233277, "grad_norm": 2.6030354499816895, "kl": 0.19739481806755066, "learning_rate": 1.1034818964167749e-07, "loss": 0.0079, "reward": 1.6592187881469727, "reward_std": 0.6938918232917786, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.47171875834465027, "step": 5128 }, { "completion_length": 133.40625, "epoch": 2.744248261102194, "grad_norm": 0.9501461386680603, "kl": 0.21898791193962097, "learning_rate": 1.09891328591126e-07, "loss": 0.0088, "reward": 2.295562505722046, "reward_std": 0.8821324110031128, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4674375057220459, "step": 5129 }, { "completion_length": 147.28125, "epoch": 2.744783306581059, "grad_norm": 1.5911412239074707, "kl": 0.16969667375087738, "learning_rate": 1.0943539399249636e-07, "loss": 0.0068, "reward": 1.9140000343322754, "reward_std": 0.8632372617721558, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.476500004529953, "step": 5130 }, { "completion_length": 135.0625, "epoch": 2.7453183520599254, "grad_norm": 22.91193962097168, "kl": 1.3737589120864868, "learning_rate": 1.0898038602251748e-07, "loss": 0.055, "reward": 1.6094374656677246, "reward_std": 0.5561383962631226, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.468812495470047, "step": 5131 }, { "completion_length": 131.4375, "epoch": 2.7458533975387907, "grad_norm": 0.717231810092926, "kl": 0.18175506591796875, "learning_rate": 1.0852630485755939e-07, "loss": 0.0073, "reward": 1.9193124771118164, "reward_std": 0.8813450336456299, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4818125069141388, "step": 5132 }, { "completion_length": 146.40625, "epoch": 2.7463884430176564, "grad_norm": 0.5664874911308289, "kl": 0.1345721185207367, "learning_rate": 1.0807315067363184e-07, "loss": 0.0054, "reward": 1.9942500591278076, "reward_std": 0.8855065107345581, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43174999952316284, "step": 5133 }, { "completion_length": 140.375, "epoch": 2.746923488496522, "grad_norm": 0.9796385169029236, "kl": 0.1778392791748047, "learning_rate": 1.076209236463871e-07, "loss": 0.0071, "reward": 2.3934688568115234, "reward_std": 0.8366597890853882, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4715937376022339, "step": 5134 }, { "completion_length": 133.65625, "epoch": 2.747458533975388, "grad_norm": 10044.9833984375, "kl": 40.55727005004883, "learning_rate": 1.0716962395111663e-07, "loss": 1.6223, "reward": 1.8713124990463257, "reward_std": 0.7954521179199219, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4650624990463257, "step": 5135 }, { "completion_length": 129.71875, "epoch": 2.7479935794542536, "grad_norm": 1.5227621793746948, "kl": 0.35095518827438354, "learning_rate": 1.0671925176275244e-07, "loss": 0.014, "reward": 2.298281192779541, "reward_std": 0.2859264314174652, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4389062523841858, "step": 5136 }, { "completion_length": 129.46875, "epoch": 2.7485286249331193, "grad_norm": 0.8232898116111755, "kl": 0.17712005972862244, "learning_rate": 1.0626980725586794e-07, "loss": 0.0071, "reward": 2.177281379699707, "reward_std": 0.9178026914596558, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4897812604904175, "step": 5137 }, { "completion_length": 126.09375, "epoch": 2.749063670411985, "grad_norm": 622813.9375, "kl": 102608.890625, "learning_rate": 1.0582129060467656e-07, "loss": 4104.3569, "reward": 2.0707502365112305, "reward_std": 1.0598549842834473, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43012499809265137, "step": 5138 }, { "completion_length": 143.84375, "epoch": 2.749598715890851, "grad_norm": 0.868998646736145, "kl": 0.13657104969024658, "learning_rate": 1.0537370198303116e-07, "loss": 0.0055, "reward": 1.8267500400543213, "reward_std": 0.8121933341026306, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4673749804496765, "step": 5139 }, { "completion_length": 122.1875, "epoch": 2.7501337613697165, "grad_norm": 1.531678318977356, "kl": 0.26395899057388306, "learning_rate": 1.0492704156442657e-07, "loss": 0.0106, "reward": 1.8250937461853027, "reward_std": 0.679749608039856, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46571874618530273, "step": 5140 }, { "completion_length": 156.6875, "epoch": 2.7506688068485823, "grad_norm": 0.7855131030082703, "kl": 0.179745152592659, "learning_rate": 1.0448130952199626e-07, "loss": 0.0072, "reward": 1.9632500410079956, "reward_std": 1.0709691047668457, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4320000112056732, "step": 5141 }, { "completion_length": 114.25, "epoch": 2.751203852327448, "grad_norm": 0.7921222448348999, "kl": 0.17251163721084595, "learning_rate": 1.0403650602851506e-07, "loss": 0.0069, "reward": 2.240000009536743, "reward_std": 0.9591190218925476, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45875000953674316, "step": 5142 }, { "completion_length": 126.03125, "epoch": 2.7517388978063133, "grad_norm": 1.8189680576324463, "kl": 0.19637377560138702, "learning_rate": 1.0359263125639673e-07, "loss": 0.0079, "reward": 1.8854999542236328, "reward_std": 0.955930233001709, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4636249840259552, "step": 5143 }, { "completion_length": 136.75, "epoch": 2.7522739432851795, "grad_norm": 0.9623880386352539, "kl": 0.2340545654296875, "learning_rate": 1.0314968537769615e-07, "loss": 0.0094, "reward": 1.95703125, "reward_std": 0.6138797998428345, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47265625, "step": 5144 }, { "completion_length": 145.4375, "epoch": 2.752808988764045, "grad_norm": 142083040.0, "kl": 88739.4453125, "learning_rate": 1.0270766856410763e-07, "loss": 3549.5776, "reward": 1.3275938034057617, "reward_std": 0.625537097454071, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.40571874380111694, "step": 5145 }, { "completion_length": 97.9375, "epoch": 2.7533440342429105, "grad_norm": 0.518261194229126, "kl": 0.17501160502433777, "learning_rate": 1.0226658098696468e-07, "loss": 0.007, "reward": 2.9375, "reward_std": 0.5260357856750488, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 5146 }, { "completion_length": 117.34375, "epoch": 2.7538790797217763, "grad_norm": 0.8940033912658691, "kl": 0.2120707631111145, "learning_rate": 1.0182642281724248e-07, "loss": 0.0085, "reward": 1.9204063415527344, "reward_std": 0.40288645029067993, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4829062521457672, "step": 5147 }, { "completion_length": 109.875, "epoch": 2.754414125200642, "grad_norm": 1.0350090265274048, "kl": 0.17996755242347717, "learning_rate": 1.0138719422555343e-07, "loss": 0.0072, "reward": 2.203125, "reward_std": 0.5002449154853821, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 5148 }, { "completion_length": 138.5, "epoch": 2.7549491706795077, "grad_norm": 0.3382675051689148, "kl": 0.15150794386863708, "learning_rate": 1.0094889538215136e-07, "loss": 0.0061, "reward": 2.2766876220703125, "reward_std": 0.887595534324646, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47981250286102295, "step": 5149 }, { "completion_length": 136.96875, "epoch": 2.7554842161583735, "grad_norm": 1.2972581386566162, "kl": 0.21492312848567963, "learning_rate": 1.0051152645693036e-07, "loss": 0.0086, "reward": 2.0648751258850098, "reward_std": 0.7111021280288696, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4398750066757202, "step": 5150 }, { "completion_length": 170.65625, "epoch": 2.756019261637239, "grad_norm": 0.585460901260376, "kl": 0.13771075010299683, "learning_rate": 1.0007508761942175e-07, "loss": 0.0055, "reward": 1.397531270980835, "reward_std": 0.7823863625526428, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3506562411785126, "step": 5151 }, { "completion_length": 135.1875, "epoch": 2.756554307116105, "grad_norm": 1.5144305229187012, "kl": 0.22291752696037292, "learning_rate": 9.9639579038798e-08, "loss": 0.0089, "reward": 2.1077187061309814, "reward_std": 0.8513820767402649, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4670937657356262, "step": 5152 }, { "completion_length": 122.28125, "epoch": 2.7570893525949707, "grad_norm": 3.377743721008301, "kl": 0.4340606927871704, "learning_rate": 9.920500088387075e-08, "loss": 0.0174, "reward": 2.184000015258789, "reward_std": 0.6414614915847778, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48087501525878906, "step": 5153 }, { "completion_length": 140.15625, "epoch": 2.7576243980738364, "grad_norm": 0.5515365600585938, "kl": 0.12862572073936462, "learning_rate": 9.877135332309024e-08, "loss": 0.0051, "reward": 2.589718818664551, "reward_std": 0.5790774822235107, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.449093759059906, "step": 5154 }, { "completion_length": 153.8125, "epoch": 2.758159443552702, "grad_norm": 0.9017179012298584, "kl": 0.1442798376083374, "learning_rate": 9.833863652454755e-08, "loss": 0.0058, "reward": 1.9155625104904175, "reward_std": 0.8910685777664185, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4780625104904175, "step": 5155 }, { "completion_length": 120.78125, "epoch": 2.7586944890315674, "grad_norm": 16.60624885559082, "kl": 1.5960168838500977, "learning_rate": 9.790685065597105e-08, "loss": 0.0638, "reward": 2.3524374961853027, "reward_std": 0.664913535118103, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49306249618530273, "step": 5156 }, { "completion_length": 137.90625, "epoch": 2.7592295345104336, "grad_norm": 0.6514831185340881, "kl": 0.21439510583877563, "learning_rate": 9.747599588472988e-08, "loss": 0.0086, "reward": 1.6983437538146973, "reward_std": 0.8906911611557007, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44834375381469727, "step": 5157 }, { "completion_length": 132.53125, "epoch": 2.759764579989299, "grad_norm": 0.6263437867164612, "kl": 0.13701514899730682, "learning_rate": 9.704607237783104e-08, "loss": 0.0055, "reward": 2.8257498741149902, "reward_std": 0.7743638753890991, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4819999933242798, "step": 5158 }, { "completion_length": 102.5625, "epoch": 2.7602996254681647, "grad_norm": 1.1142868995666504, "kl": 0.18262463808059692, "learning_rate": 9.661708030192174e-08, "loss": 0.0073, "reward": 2.881124973297119, "reward_std": 0.6048960089683533, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49050000309944153, "step": 5159 }, { "completion_length": 134.3125, "epoch": 2.7608346709470304, "grad_norm": 1.5481560230255127, "kl": 0.1935347020626068, "learning_rate": 9.618901982328704e-08, "loss": 0.0077, "reward": 1.9737187623977661, "reward_std": 0.8376790285110474, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4893437623977661, "step": 5160 }, { "completion_length": 144.46875, "epoch": 2.761369716425896, "grad_norm": 0.8009300231933594, "kl": 0.20776285231113434, "learning_rate": 9.576189110785144e-08, "loss": 0.0083, "reward": 1.85546875, "reward_std": 0.9645355343818665, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48046875, "step": 5161 }, { "completion_length": 134.3125, "epoch": 2.761904761904762, "grad_norm": 150.08168029785156, "kl": 1.5136823654174805, "learning_rate": 9.533569432117862e-08, "loss": 0.0605, "reward": 1.5784063339233398, "reward_std": 1.0431077480316162, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43778127431869507, "step": 5162 }, { "completion_length": 129.25, "epoch": 2.7624398073836276, "grad_norm": 1.0158002376556396, "kl": 0.22145263850688934, "learning_rate": 9.49104296284703e-08, "loss": 0.0089, "reward": 1.437250018119812, "reward_std": 0.4601094126701355, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.468500018119812, "step": 5163 }, { "completion_length": 97.375, "epoch": 2.7629748528624933, "grad_norm": 1.4025673866271973, "kl": 0.22148509323596954, "learning_rate": 9.448609719456687e-08, "loss": 0.0089, "reward": 2.9375, "reward_std": 0.4082317352294922, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 5164 }, { "completion_length": 114.53125, "epoch": 2.763509898341359, "grad_norm": 0.9572832584381104, "kl": 0.2761375606060028, "learning_rate": 9.406269718394868e-08, "loss": 0.011, "reward": 2.3380000591278076, "reward_std": 0.9270274639129639, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49424999952316284, "step": 5165 }, { "completion_length": 134.6875, "epoch": 2.764044943820225, "grad_norm": 5229.23681640625, "kl": 522.1869506835938, "learning_rate": 9.364022976073278e-08, "loss": 20.8875, "reward": 2.2784063816070557, "reward_std": 1.1413168907165527, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4659062623977661, "step": 5166 }, { "completion_length": 135.25, "epoch": 2.7645799892990905, "grad_norm": 6.4337897300720215, "kl": 0.18010127544403076, "learning_rate": 9.321869508867571e-08, "loss": 0.0072, "reward": 1.936843752861023, "reward_std": 0.5433419942855835, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.42121875286102295, "step": 5167 }, { "completion_length": 126.375, "epoch": 2.7651150347779563, "grad_norm": 1.630800724029541, "kl": 0.1686224788427353, "learning_rate": 9.279809333117313e-08, "loss": 0.0067, "reward": 2.077625036239624, "reward_std": 0.9543975591659546, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43700000643730164, "step": 5168 }, { "completion_length": 144.71875, "epoch": 2.7656500802568216, "grad_norm": 0.8193721771240234, "kl": 0.1419874131679535, "learning_rate": 9.237842465125768e-08, "loss": 0.0057, "reward": 1.8359375, "reward_std": 0.8463743925094604, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4453125, "step": 5169 }, { "completion_length": 130.1875, "epoch": 2.7661851257356878, "grad_norm": 1.665366530418396, "kl": 0.19514206051826477, "learning_rate": 9.19596892116012e-08, "loss": 0.0078, "reward": 2.2275938987731934, "reward_std": 0.36214467883110046, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4619687497615814, "step": 5170 }, { "completion_length": 107.0625, "epoch": 2.766720171214553, "grad_norm": 1.3047057390213013, "kl": 0.2050095796585083, "learning_rate": 9.154188717451329e-08, "loss": 0.0082, "reward": 2.5562186241149902, "reward_std": 0.8712092638015747, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4937187433242798, "step": 5171 }, { "completion_length": 122.21875, "epoch": 2.767255216693419, "grad_norm": 1.9123941659927368, "kl": 0.21596500277519226, "learning_rate": 9.112501870194273e-08, "loss": 0.0086, "reward": 2.097531318664551, "reward_std": 0.947482705116272, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 5172 }, { "completion_length": 134.90625, "epoch": 2.7677902621722845, "grad_norm": 13.124130249023438, "kl": 0.8550711870193481, "learning_rate": 9.070908395547501e-08, "loss": 0.0342, "reward": 2.1453750133514404, "reward_std": 1.0930923223495483, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45787501335144043, "step": 5173 }, { "completion_length": 127.5, "epoch": 2.7683253076511503, "grad_norm": 1.1129366159439087, "kl": 0.27945196628570557, "learning_rate": 9.029408309633447e-08, "loss": 0.0112, "reward": 2.0530312061309814, "reward_std": 0.8796601295471191, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4592812657356262, "step": 5174 }, { "completion_length": 120.46875, "epoch": 2.768860353130016, "grad_norm": 0.9793901443481445, "kl": 0.16620594263076782, "learning_rate": 8.988001628538411e-08, "loss": 0.0066, "reward": 2.424062490463257, "reward_std": 0.8630651235580444, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4865625202655792, "step": 5175 }, { "completion_length": 142.65625, "epoch": 2.7693953986088817, "grad_norm": 1.4991506338119507, "kl": 0.2102103978395462, "learning_rate": 8.946688368312306e-08, "loss": 0.0084, "reward": 2.2039999961853027, "reward_std": 1.0366287231445312, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.43837499618530273, "step": 5176 }, { "completion_length": 135.71875, "epoch": 2.7699304440877475, "grad_norm": 0.9115444421768188, "kl": 0.22835856676101685, "learning_rate": 8.905468544969014e-08, "loss": 0.0091, "reward": 1.859375, "reward_std": 0.644726037979126, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 5177 }, { "completion_length": 120.0625, "epoch": 2.770465489566613, "grad_norm": 1.2575172185897827, "kl": 0.2053641527891159, "learning_rate": 8.864342174486145e-08, "loss": 0.0082, "reward": 2.846874952316284, "reward_std": 0.7023345232009888, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47187501192092896, "step": 5178 }, { "completion_length": 105.3125, "epoch": 2.771000535045479, "grad_norm": 0.9188023209571838, "kl": 0.294126033782959, "learning_rate": 8.823309272804975e-08, "loss": 0.0118, "reward": 2.656125068664551, "reward_std": 1.0048869848251343, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 5179 }, { "completion_length": 136.3125, "epoch": 2.7715355805243447, "grad_norm": 1.4135901927947998, "kl": 0.16841411590576172, "learning_rate": 8.782369855830752e-08, "loss": 0.0067, "reward": 2.1966874599456787, "reward_std": 0.7421706914901733, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4310624897480011, "step": 5180 }, { "completion_length": 133.59375, "epoch": 2.7720706260032104, "grad_norm": 2.2410061359405518, "kl": 0.1756148785352707, "learning_rate": 8.741523939432339e-08, "loss": 0.007, "reward": 2.300656318664551, "reward_std": 0.6911646723747253, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 5181 }, { "completion_length": 97.34375, "epoch": 2.7726056714820757, "grad_norm": 2.100032329559326, "kl": 0.22136449813842773, "learning_rate": 8.700771539442377e-08, "loss": 0.0089, "reward": 3.109375, "reward_std": 0.6404510736465454, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 5182 }, { "completion_length": 121.375, "epoch": 2.773140716960942, "grad_norm": 0.6625149846076965, "kl": 0.18495284020900726, "learning_rate": 8.660112671657284e-08, "loss": 0.0074, "reward": 2.209812641143799, "reward_std": 0.5980671048164368, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4910624921321869, "step": 5183 }, { "completion_length": 127.6875, "epoch": 2.773675762439807, "grad_norm": 0.6467762589454651, "kl": 0.18860331177711487, "learning_rate": 8.619547351837259e-08, "loss": 0.0075, "reward": 2.3600311279296875, "reward_std": 1.1403590440750122, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.43815624713897705, "step": 5184 }, { "completion_length": 144.25, "epoch": 2.7742108079186734, "grad_norm": 1.9803085327148438, "kl": 0.28493162989616394, "learning_rate": 8.579075595706143e-08, "loss": 0.0114, "reward": 1.6486562490463257, "reward_std": 0.9664803743362427, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4142812490463257, "step": 5185 }, { "completion_length": 140.0, "epoch": 2.7747458533975387, "grad_norm": 3.3558385372161865, "kl": 0.27259352803230286, "learning_rate": 8.538697418951553e-08, "loss": 0.0109, "reward": 1.817906379699707, "reward_std": 0.7613081336021423, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4429062604904175, "step": 5186 }, { "completion_length": 128.84375, "epoch": 2.7752808988764044, "grad_norm": 1.339776873588562, "kl": 0.16329023241996765, "learning_rate": 8.498412837224884e-08, "loss": 0.0065, "reward": 2.96875, "reward_std": 0.7712129354476929, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 5187 }, { "completion_length": 155.5625, "epoch": 2.77581594435527, "grad_norm": 1.2994649410247803, "kl": 0.22474804520606995, "learning_rate": 8.458221866141231e-08, "loss": 0.009, "reward": 1.9486249685287476, "reward_std": 1.0840448141098022, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.40174999833106995, "step": 5188 }, { "completion_length": 133.9375, "epoch": 2.776350989834136, "grad_norm": 0.9755420088768005, "kl": 0.19073379039764404, "learning_rate": 8.418124521279297e-08, "loss": 0.0076, "reward": 2.200124979019165, "reward_std": 0.8327519297599792, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46574997901916504, "step": 5189 }, { "completion_length": 144.03125, "epoch": 2.7768860353130016, "grad_norm": 1.571750283241272, "kl": 0.17788301408290863, "learning_rate": 8.378120818181707e-08, "loss": 0.0071, "reward": 1.6853125095367432, "reward_std": 0.9339229464530945, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41968750953674316, "step": 5190 }, { "completion_length": 125.875, "epoch": 2.7774210807918673, "grad_norm": 0.9147923588752747, "kl": 0.24598313868045807, "learning_rate": 8.338210772354555e-08, "loss": 0.0098, "reward": 2.106562614440918, "reward_std": 0.9152517914772034, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4659374952316284, "step": 5191 }, { "completion_length": 131.21875, "epoch": 2.777956126270733, "grad_norm": 0.9661767482757568, "kl": 0.15173138678073883, "learning_rate": 8.298394399267745e-08, "loss": 0.0061, "reward": 1.9004374742507935, "reward_std": 0.5434013605117798, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47856250405311584, "step": 5192 }, { "completion_length": 136.4375, "epoch": 2.778491171749599, "grad_norm": 0.9054070711135864, "kl": 0.17240920662879944, "learning_rate": 8.258671714354988e-08, "loss": 0.0069, "reward": 1.7105624675750732, "reward_std": 0.6899800300598145, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46056249737739563, "step": 5193 }, { "completion_length": 122.21875, "epoch": 2.7790262172284645, "grad_norm": 11.253290176391602, "kl": 0.3764522969722748, "learning_rate": 8.21904273301341e-08, "loss": 0.0151, "reward": 2.6127500534057617, "reward_std": 0.7483386993408203, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48774999380111694, "step": 5194 }, { "completion_length": 131.375, "epoch": 2.7795612627073303, "grad_norm": 0.5975737571716309, "kl": 0.13633307814598083, "learning_rate": 8.17950747060403e-08, "loss": 0.0055, "reward": 1.6941875219345093, "reward_std": 0.8050549030303955, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4598124921321869, "step": 5195 }, { "completion_length": 119.5625, "epoch": 2.780096308186196, "grad_norm": 0.6367579698562622, "kl": 0.1802840232849121, "learning_rate": 8.14006594245148e-08, "loss": 0.0072, "reward": 2.389218807220459, "reward_std": 0.6241769790649414, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4829687476158142, "step": 5196 }, { "completion_length": 117.03125, "epoch": 2.7806313536650613, "grad_norm": 0.9207395911216736, "kl": 0.2138063609600067, "learning_rate": 8.100718163844028e-08, "loss": 0.0086, "reward": 2.225656270980835, "reward_std": 0.4865116477012634, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4600312411785126, "step": 5197 }, { "completion_length": 132.3125, "epoch": 2.7811663991439275, "grad_norm": 1.4689980745315552, "kl": 0.2834548354148865, "learning_rate": 8.061464150033644e-08, "loss": 0.0113, "reward": 2.260124921798706, "reward_std": 0.881768524646759, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43199998140335083, "step": 5198 }, { "completion_length": 163.09375, "epoch": 2.781701444622793, "grad_norm": 0.7774863839149475, "kl": 0.16245132684707642, "learning_rate": 8.02230391623593e-08, "loss": 0.0065, "reward": 1.2630624771118164, "reward_std": 0.5723733305931091, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4036874771118164, "step": 5199 }, { "completion_length": 130.15625, "epoch": 2.7822364901016585, "grad_norm": 0.6076402068138123, "kl": 0.16910851001739502, "learning_rate": 7.983237477630135e-08, "loss": 0.0068, "reward": 2.1957812309265137, "reward_std": 0.36262422800064087, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46140626072883606, "step": 5200 }, { "completion_length": 134.40625, "epoch": 2.7827715355805243, "grad_norm": 0.7890190482139587, "kl": 0.1683223843574524, "learning_rate": 7.944264849359173e-08, "loss": 0.0067, "reward": 2.162374973297119, "reward_std": 0.9517679214477539, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42800000309944153, "step": 5201 }, { "completion_length": 133.40625, "epoch": 2.78330658105939, "grad_norm": 0.7838730216026306, "kl": 0.15219417214393616, "learning_rate": 7.905386046529601e-08, "loss": 0.0061, "reward": 2.530750036239624, "reward_std": 0.9699180126190186, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49950000643730164, "step": 5202 }, { "completion_length": 129.84375, "epoch": 2.7838416265382557, "grad_norm": 0.8497266173362732, "kl": 0.1387825459241867, "learning_rate": 7.866601084211528e-08, "loss": 0.0056, "reward": 2.1548125743865967, "reward_std": 0.378271222114563, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4673125147819519, "step": 5203 }, { "completion_length": 157.375, "epoch": 2.7843766720171215, "grad_norm": 0.508148729801178, "kl": 0.12719550728797913, "learning_rate": 7.827909977438792e-08, "loss": 0.0051, "reward": 1.704281210899353, "reward_std": 1.0165823698043823, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3917812407016754, "step": 5204 }, { "completion_length": 134.15625, "epoch": 2.784911717495987, "grad_norm": 1.517164707183838, "kl": 0.17120589315891266, "learning_rate": 7.78931274120881e-08, "loss": 0.0068, "reward": 1.7757500410079956, "reward_std": 0.7680688500404358, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4320000112056732, "step": 5205 }, { "completion_length": 115.40625, "epoch": 2.785446762974853, "grad_norm": 1.70553457736969, "kl": 0.2789486050605774, "learning_rate": 7.750809390482589e-08, "loss": 0.0112, "reward": 2.4400312900543213, "reward_std": 1.0526831150054932, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4869062602519989, "step": 5206 }, { "completion_length": 107.90625, "epoch": 2.7859818084537187, "grad_norm": 1.6049541234970093, "kl": 0.2172432541847229, "learning_rate": 7.712399940184744e-08, "loss": 0.0087, "reward": 1.8547186851501465, "reward_std": 0.23990686237812042, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.49534374475479126, "step": 5207 }, { "completion_length": 127.5, "epoch": 2.7865168539325844, "grad_norm": 0.7201713919639587, "kl": 0.22248475253582, "learning_rate": 7.674084405203591e-08, "loss": 0.0089, "reward": 1.5329999923706055, "reward_std": 0.7506842017173767, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.43924999237060547, "step": 5208 }, { "completion_length": 138.0625, "epoch": 2.78705189941145, "grad_norm": 0.8170158863067627, "kl": 0.2092650681734085, "learning_rate": 7.635862800390914e-08, "loss": 0.0084, "reward": 1.83203125, "reward_std": 0.5308719873428345, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 5209 }, { "completion_length": 132.40625, "epoch": 2.7875869448903154, "grad_norm": 0.5941784977912903, "kl": 0.13690629601478577, "learning_rate": 7.59773514056214e-08, "loss": 0.0055, "reward": 2.410031318664551, "reward_std": 0.748936653137207, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 5210 }, { "completion_length": 129.5, "epoch": 2.7881219903691816, "grad_norm": 0.8014156222343445, "kl": 0.18759940564632416, "learning_rate": 7.559701440496281e-08, "loss": 0.0075, "reward": 2.1248438358306885, "reward_std": 0.6513457298278809, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4529687464237213, "step": 5211 }, { "completion_length": 115.8125, "epoch": 2.788657035848047, "grad_norm": 0.9908415675163269, "kl": 0.19502143561840057, "learning_rate": 7.52176171493596e-08, "loss": 0.0078, "reward": 2.444812536239624, "reward_std": 0.9896490573883057, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47606250643730164, "step": 5212 }, { "completion_length": 116.15625, "epoch": 2.7891920813269127, "grad_norm": 1.4246208667755127, "kl": 0.220054492354393, "learning_rate": 7.483915978587303e-08, "loss": 0.0088, "reward": 2.0038437843322754, "reward_std": 0.8409900665283203, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.488218754529953, "step": 5213 }, { "completion_length": 152.625, "epoch": 2.7897271268057784, "grad_norm": 25.856029510498047, "kl": 0.71305912733078, "learning_rate": 7.44616424612002e-08, "loss": 0.0285, "reward": 1.686843752861023, "reward_std": 0.7470598816871643, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45246875286102295, "step": 5214 }, { "completion_length": 138.71875, "epoch": 2.790262172284644, "grad_norm": 0.6872220635414124, "kl": 0.16446137428283691, "learning_rate": 7.408506532167458e-08, "loss": 0.0066, "reward": 1.6675000190734863, "reward_std": 0.6638685464859009, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46437498927116394, "step": 5215 }, { "completion_length": 132.65625, "epoch": 2.79079721776351, "grad_norm": 1.1740564107894897, "kl": 0.1726217269897461, "learning_rate": 7.370942851326474e-08, "loss": 0.0069, "reward": 2.0277187824249268, "reward_std": 0.5955439805984497, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.44959375262260437, "step": 5216 }, { "completion_length": 123.96875, "epoch": 2.7913322632423756, "grad_norm": 0.5069859623908997, "kl": 0.17108877003192902, "learning_rate": 7.333473218157416e-08, "loss": 0.0068, "reward": 2.48828125, "reward_std": 0.6224788427352905, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 5217 }, { "completion_length": 143.5625, "epoch": 2.7918673087212413, "grad_norm": 0.9715997576713562, "kl": 0.17935049533843994, "learning_rate": 7.296097647184308e-08, "loss": 0.0072, "reward": 2.2478749752044678, "reward_std": 0.9657593369483948, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41975000500679016, "step": 5218 }, { "completion_length": 141.09375, "epoch": 2.792402354200107, "grad_norm": 0.45250484347343445, "kl": 0.14757831394672394, "learning_rate": 7.25881615289456e-08, "loss": 0.0059, "reward": 1.6445624828338623, "reward_std": 0.42192894220352173, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4726874828338623, "step": 5219 }, { "completion_length": 125.21875, "epoch": 2.792937399678973, "grad_norm": 1.3448578119277954, "kl": 0.28249961137771606, "learning_rate": 7.221628749739224e-08, "loss": 0.0113, "reward": 2.299499988555908, "reward_std": 0.883568286895752, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4713750183582306, "step": 5220 }, { "completion_length": 121.90625, "epoch": 2.7934724451578385, "grad_norm": 1.1505557298660278, "kl": 0.18188077211380005, "learning_rate": 7.184535452132879e-08, "loss": 0.0073, "reward": 2.2166249752044678, "reward_std": 0.9251705408096313, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48225000500679016, "step": 5221 }, { "completion_length": 129.78125, "epoch": 2.7940074906367043, "grad_norm": 2.796123743057251, "kl": 0.2841985821723938, "learning_rate": 7.147536274453526e-08, "loss": 0.0114, "reward": 1.9816875457763672, "reward_std": 0.6370704770088196, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4660625159740448, "step": 5222 }, { "completion_length": 129.09375, "epoch": 2.7945425361155696, "grad_norm": 1.0577462911605835, "kl": 0.14900153875350952, "learning_rate": 7.110631231042858e-08, "loss": 0.006, "reward": 2.4536561965942383, "reward_std": 0.9314603209495544, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43803125619888306, "step": 5223 }, { "completion_length": 151.21875, "epoch": 2.7950775815944358, "grad_norm": 1.028795599937439, "kl": 0.1974698007106781, "learning_rate": 7.073820336205878e-08, "loss": 0.0079, "reward": 1.1044374704360962, "reward_std": 0.7035184502601624, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3856875002384186, "step": 5224 }, { "completion_length": 133.375, "epoch": 2.795612627073301, "grad_norm": 0.9228517413139343, "kl": 0.1887531280517578, "learning_rate": 7.037103604211253e-08, "loss": 0.0076, "reward": 2.7573750019073486, "reward_std": 1.2215509414672852, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46050000190734863, "step": 5225 }, { "completion_length": 142.875, "epoch": 2.796147672552167, "grad_norm": 0.8108000755310059, "kl": 0.14708749949932098, "learning_rate": 7.000481049291074e-08, "loss": 0.0059, "reward": 2.3862812519073486, "reward_std": 1.1923578977584839, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.41753125190734863, "step": 5226 }, { "completion_length": 127.25, "epoch": 2.7966827180310325, "grad_norm": 1.0561089515686035, "kl": 0.25467216968536377, "learning_rate": 6.963952685640929e-08, "loss": 0.0102, "reward": 1.9838125705718994, "reward_std": 0.892952561378479, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45256251096725464, "step": 5227 }, { "completion_length": 125.59375, "epoch": 2.7972177635098983, "grad_norm": 2.351140260696411, "kl": 0.23304462432861328, "learning_rate": 6.927518527419963e-08, "loss": 0.0093, "reward": 2.163875102996826, "reward_std": 1.0055774450302124, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.476375013589859, "step": 5228 }, { "completion_length": 135.6875, "epoch": 2.797752808988764, "grad_norm": 0.9497690200805664, "kl": 0.21724849939346313, "learning_rate": 6.891178588750686e-08, "loss": 0.0087, "reward": 2.2579689025878906, "reward_std": 0.7846083641052246, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4767187535762787, "step": 5229 }, { "completion_length": 145.78125, "epoch": 2.7982878544676297, "grad_norm": 0.5341806411743164, "kl": 0.17030920088291168, "learning_rate": 6.854932883719217e-08, "loss": 0.0068, "reward": 1.9758437871932983, "reward_std": 0.8455259799957275, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41334375739097595, "step": 5230 }, { "completion_length": 116.15625, "epoch": 2.7988228999464955, "grad_norm": 6.979129314422607, "kl": 0.5509225130081177, "learning_rate": 6.818781426375043e-08, "loss": 0.022, "reward": 2.108968734741211, "reward_std": 0.7231590747833252, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48396873474121094, "step": 5231 }, { "completion_length": 111.90625, "epoch": 2.799357945425361, "grad_norm": 1.4392009973526, "kl": 0.21567407250404358, "learning_rate": 6.782724230731147e-08, "loss": 0.0086, "reward": 2.886625051498413, "reward_std": 0.8424609899520874, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4959999918937683, "step": 5232 }, { "completion_length": 118.65625, "epoch": 2.799892990904227, "grad_norm": 0.32017216086387634, "kl": 0.1733016073703766, "learning_rate": 6.746761310764044e-08, "loss": 0.0069, "reward": 3.25, "reward_std": 0.4355512857437134, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 5233 }, { "completion_length": 127.71875, "epoch": 2.8004280363830927, "grad_norm": 0.7244096398353577, "kl": 0.17938384413719177, "learning_rate": 6.710892680413638e-08, "loss": 0.0072, "reward": 2.313406229019165, "reward_std": 1.0721471309661865, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4540312588214874, "step": 5234 }, { "completion_length": 118.25, "epoch": 2.8009630818619584, "grad_norm": 0.5731593370437622, "kl": 0.20363786816596985, "learning_rate": 6.675118353583254e-08, "loss": 0.0081, "reward": 2.30078125, "reward_std": 0.1300809234380722, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 5235 }, { "completion_length": 133.25, "epoch": 2.8014981273408237, "grad_norm": 0.6426854729652405, "kl": 0.16300544142723083, "learning_rate": 6.639438344139798e-08, "loss": 0.0065, "reward": 2.25028133392334, "reward_std": 0.9673022627830505, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4690312445163727, "step": 5236 }, { "completion_length": 125.0625, "epoch": 2.80203317281969, "grad_norm": 2.479081153869629, "kl": 0.2036479413509369, "learning_rate": 6.603852665913401e-08, "loss": 0.0081, "reward": 2.117906093597412, "reward_std": 0.6378173828125, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44603127241134644, "step": 5237 }, { "completion_length": 124.6875, "epoch": 2.802568218298555, "grad_norm": 1.2175027132034302, "kl": 0.2191234827041626, "learning_rate": 6.56836133269781e-08, "loss": 0.0088, "reward": 2.257625102996826, "reward_std": 0.9064962863922119, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4919999837875366, "step": 5238 }, { "completion_length": 102.71875, "epoch": 2.803103263777421, "grad_norm": 0.6454620957374573, "kl": 0.2429049015045166, "learning_rate": 6.53296435825021e-08, "loss": 0.0097, "reward": 2.1845312118530273, "reward_std": 0.44048064947128296, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48140624165534973, "step": 5239 }, { "completion_length": 110.1875, "epoch": 2.8036383092562867, "grad_norm": 0.9058054089546204, "kl": 0.24510055780410767, "learning_rate": 6.497661756291046e-08, "loss": 0.0098, "reward": 3.1449687480926514, "reward_std": 0.5257841944694519, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48871874809265137, "step": 5240 }, { "completion_length": 121.4375, "epoch": 2.8041733547351524, "grad_norm": 1.441596508026123, "kl": 0.21647518873214722, "learning_rate": 6.462453540504343e-08, "loss": 0.0087, "reward": 2.432312488555908, "reward_std": 0.6020525693893433, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4635624885559082, "step": 5241 }, { "completion_length": 125.40625, "epoch": 2.804708400214018, "grad_norm": 0.9780990481376648, "kl": 0.14696434140205383, "learning_rate": 6.42733972453749e-08, "loss": 0.0059, "reward": 1.9521875381469727, "reward_std": 0.8719520568847656, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48343750834465027, "step": 5242 }, { "completion_length": 126.875, "epoch": 2.805243445692884, "grad_norm": 0.6312953233718872, "kl": 0.1824401319026947, "learning_rate": 6.392320322001184e-08, "loss": 0.0073, "reward": 2.718125104904175, "reward_std": 0.904035210609436, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.49937501549720764, "step": 5243 }, { "completion_length": 127.625, "epoch": 2.8057784911717496, "grad_norm": 1.9728448390960693, "kl": 0.21040740609169006, "learning_rate": 6.357395346469731e-08, "loss": 0.0084, "reward": 2.4614062309265137, "reward_std": 0.8989315032958984, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46140626072883606, "step": 5244 }, { "completion_length": 131.75, "epoch": 2.8063135366506153, "grad_norm": 12.956932067871094, "kl": 2.9214935302734375, "learning_rate": 6.322564811480664e-08, "loss": 0.1169, "reward": 1.7001874446868896, "reward_std": 0.8622453212738037, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45018747448921204, "step": 5245 }, { "completion_length": 118.375, "epoch": 2.806848582129481, "grad_norm": 0.7848894000053406, "kl": 0.19134455919265747, "learning_rate": 6.287828730534962e-08, "loss": 0.0077, "reward": 2.0722498893737793, "reward_std": 0.8395121097564697, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46287500858306885, "step": 5246 }, { "completion_length": 128.46875, "epoch": 2.807383627608347, "grad_norm": 7.053023815155029, "kl": 0.37304216623306274, "learning_rate": 6.253187117096992e-08, "loss": 0.0149, "reward": 1.7660937309265137, "reward_std": 0.5768507719039917, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48484376072883606, "step": 5247 }, { "completion_length": 115.375, "epoch": 2.8079186730872125, "grad_norm": 4.691122055053711, "kl": 0.28312239050865173, "learning_rate": 6.218639984594544e-08, "loss": 0.0113, "reward": 2.8359999656677246, "reward_std": 0.7017310857772827, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.492249995470047, "step": 5248 }, { "completion_length": 133.03125, "epoch": 2.808453718566078, "grad_norm": 1.8140816688537598, "kl": 0.20744897425174713, "learning_rate": 6.184187346418735e-08, "loss": 0.0083, "reward": 2.491781234741211, "reward_std": 1.043269395828247, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46053123474121094, "step": 5249 }, { "completion_length": 98.8125, "epoch": 2.808988764044944, "grad_norm": 1.3595502376556396, "kl": 0.242448091506958, "learning_rate": 6.149829215924025e-08, "loss": 0.0097, "reward": 2.8496251106262207, "reward_std": 0.813728928565979, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49024999141693115, "step": 5250 }, { "completion_length": 146.03125, "epoch": 2.8095238095238093, "grad_norm": 1.0497809648513794, "kl": 0.17085033655166626, "learning_rate": 6.115565606428343e-08, "loss": 0.0068, "reward": 1.8284687995910645, "reward_std": 0.6073808670043945, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4065937399864197, "step": 5251 }, { "completion_length": 120.9375, "epoch": 2.8100588550026755, "grad_norm": 1.188085675239563, "kl": 0.3131234645843506, "learning_rate": 6.081396531212896e-08, "loss": 0.0125, "reward": 2.8009376525878906, "reward_std": 1.106989860534668, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4728125035762787, "step": 5252 }, { "completion_length": 137.375, "epoch": 2.810593900481541, "grad_norm": 1.0704939365386963, "kl": 0.20323985815048218, "learning_rate": 6.04732200352226e-08, "loss": 0.0081, "reward": 2.116187572479248, "reward_std": 0.5654011368751526, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44431251287460327, "step": 5253 }, { "completion_length": 127.84375, "epoch": 2.8111289459604065, "grad_norm": 11.77021598815918, "kl": 1.7310041189193726, "learning_rate": 6.013342036564395e-08, "loss": 0.0692, "reward": 2.1191563606262207, "reward_std": 1.0242679119110107, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.46290624141693115, "step": 5254 }, { "completion_length": 144.53125, "epoch": 2.8116639914392723, "grad_norm": 0.39253321290016174, "kl": 0.1274821013212204, "learning_rate": 5.979456643510573e-08, "loss": 0.0051, "reward": 1.610031247138977, "reward_std": 0.7615969181060791, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.43815624713897705, "step": 5255 }, { "completion_length": 122.4375, "epoch": 2.812199036918138, "grad_norm": 2.5276920795440674, "kl": 0.18066254258155823, "learning_rate": 5.945665837495423e-08, "loss": 0.0072, "reward": 2.2125625610351562, "reward_std": 0.7070975303649902, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4469375014305115, "step": 5256 }, { "completion_length": 148.4375, "epoch": 2.8127340823970037, "grad_norm": 0.7953068017959595, "kl": 0.18770599365234375, "learning_rate": 5.911969631616915e-08, "loss": 0.0075, "reward": 1.7225937843322754, "reward_std": 0.8766738772392273, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.441343754529953, "step": 5257 }, { "completion_length": 121.21875, "epoch": 2.8132691278758695, "grad_norm": 0.6580601334571838, "kl": 0.1618155837059021, "learning_rate": 5.878368038936349e-08, "loss": 0.0065, "reward": 1.8359375, "reward_std": 0.757379412651062, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 5258 }, { "completion_length": 109.90625, "epoch": 2.813804173354735, "grad_norm": 2.6307644844055176, "kl": 0.40756624937057495, "learning_rate": 5.844861072478336e-08, "loss": 0.0163, "reward": 2.80078125, "reward_std": 1.0178265571594238, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48828125, "step": 5259 }, { "completion_length": 134.8125, "epoch": 2.814339218833601, "grad_norm": 0.8261286020278931, "kl": 0.1494063138961792, "learning_rate": 5.811448745230819e-08, "loss": 0.006, "reward": 2.0921249389648438, "reward_std": 0.8792730569839478, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4358749985694885, "step": 5260 }, { "completion_length": 123.8125, "epoch": 2.8148742643124667, "grad_norm": 1.3388288021087646, "kl": 0.16114908456802368, "learning_rate": 5.778131070145077e-08, "loss": 0.0064, "reward": 2.8124375343322754, "reward_std": 0.957603394985199, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 5261 }, { "completion_length": 120.21875, "epoch": 2.8154093097913324, "grad_norm": 1.3914498090744019, "kl": 0.24941816926002502, "learning_rate": 5.74490806013564e-08, "loss": 0.01, "reward": 1.924843668937683, "reward_std": 0.4966542720794678, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.48734375834465027, "step": 5262 }, { "completion_length": 129.09375, "epoch": 2.815944355270198, "grad_norm": 1.1650738716125488, "kl": 0.18389523029327393, "learning_rate": 5.711779728080402e-08, "loss": 0.0074, "reward": 1.764968752861023, "reward_std": 0.6709527969360352, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46809375286102295, "step": 5263 }, { "completion_length": 138.625, "epoch": 2.8164794007490634, "grad_norm": 66.94824981689453, "kl": 1.3051894903182983, "learning_rate": 5.678746086820563e-08, "loss": 0.0522, "reward": 2.0594375133514404, "reward_std": 1.2070748805999756, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.41881251335144043, "step": 5264 }, { "completion_length": 125.46875, "epoch": 2.8170144462279296, "grad_norm": 0.8149768710136414, "kl": 0.15472088754177094, "learning_rate": 5.645807149160548e-08, "loss": 0.0062, "reward": 2.09765625, "reward_std": 0.590788722038269, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48828125, "step": 5265 }, { "completion_length": 140.84375, "epoch": 2.817549491706795, "grad_norm": 1.0961785316467285, "kl": 0.223457932472229, "learning_rate": 5.6129629278681706e-08, "loss": 0.0089, "reward": 2.1259374618530273, "reward_std": 1.3045389652252197, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.42281246185302734, "step": 5266 }, { "completion_length": 114.0, "epoch": 2.8180845371856607, "grad_norm": 2.596099853515625, "kl": 0.21496275067329407, "learning_rate": 5.5802134356744696e-08, "loss": 0.0086, "reward": 1.9027187824249268, "reward_std": 0.4199967086315155, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46521875262260437, "step": 5267 }, { "completion_length": 142.65625, "epoch": 2.8186195826645264, "grad_norm": 0.5567777752876282, "kl": 0.16790609061717987, "learning_rate": 5.547558685273707e-08, "loss": 0.0067, "reward": 2.2295312881469727, "reward_std": 0.8144686818122864, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4639062285423279, "step": 5268 }, { "completion_length": 119.65625, "epoch": 2.819154628143392, "grad_norm": 1.1395130157470703, "kl": 0.21370723843574524, "learning_rate": 5.514998689323592e-08, "loss": 0.0085, "reward": 2.157656192779541, "reward_std": 0.8696186542510986, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4545312523841858, "step": 5269 }, { "completion_length": 129.65625, "epoch": 2.819689673622258, "grad_norm": 1.5753803253173828, "kl": 0.23991593718528748, "learning_rate": 5.4825334604449445e-08, "loss": 0.0096, "reward": 1.8556876182556152, "reward_std": 0.4825488030910492, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4494374990463257, "step": 5270 }, { "completion_length": 127.46875, "epoch": 2.8202247191011236, "grad_norm": 0.915970504283905, "kl": 0.15175262093544006, "learning_rate": 5.450163011221921e-08, "loss": 0.0061, "reward": 1.7854375839233398, "reward_std": 0.6703276634216309, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45731252431869507, "step": 5271 }, { "completion_length": 124.28125, "epoch": 2.8207597645799893, "grad_norm": 1.5362920761108398, "kl": 0.294385701417923, "learning_rate": 5.417887354201928e-08, "loss": 0.0118, "reward": 1.765625, "reward_std": 0.5476343631744385, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 5272 }, { "completion_length": 120.84375, "epoch": 2.821294810058855, "grad_norm": 1.5345815420150757, "kl": 0.16652613878250122, "learning_rate": 5.385706501895627e-08, "loss": 0.0067, "reward": 2.1440937519073486, "reward_std": 0.4536687135696411, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48784375190734863, "step": 5273 }, { "completion_length": 116.25, "epoch": 2.821829855537721, "grad_norm": 0.8198546767234802, "kl": 0.23569470643997192, "learning_rate": 5.3536204667769544e-08, "loss": 0.0094, "reward": 1.755406141281128, "reward_std": 0.32828477025032043, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4897812604904175, "step": 5274 }, { "completion_length": 129.6875, "epoch": 2.8223649010165865, "grad_norm": 0.7813478708267212, "kl": 0.22484123706817627, "learning_rate": 5.3216292612830176e-08, "loss": 0.009, "reward": 1.9977812767028809, "reward_std": 0.8289967775344849, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.43528124690055847, "step": 5275 }, { "completion_length": 126.8125, "epoch": 2.8228999464954523, "grad_norm": 0.6968904733657837, "kl": 0.16961966454982758, "learning_rate": 5.289732897814287e-08, "loss": 0.0068, "reward": 2.546875, "reward_std": 0.5702300071716309, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 5276 }, { "completion_length": 141.96875, "epoch": 2.8234349919743176, "grad_norm": 1.6031126976013184, "kl": 0.1965203881263733, "learning_rate": 5.2579313887343445e-08, "loss": 0.0079, "reward": 2.28125, "reward_std": 0.7871425151824951, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.5, "step": 5277 }, { "completion_length": 114.15625, "epoch": 2.8239700374531838, "grad_norm": 2.048574209213257, "kl": 0.16505232453346252, "learning_rate": 5.2262247463701064e-08, "loss": 0.0066, "reward": 2.5035312175750732, "reward_std": 0.6089584827423096, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48790624737739563, "step": 5278 }, { "completion_length": 132.0, "epoch": 2.824505082932049, "grad_norm": 1.4130717515945435, "kl": 0.1575373411178589, "learning_rate": 5.19461298301166e-08, "loss": 0.0063, "reward": 2.3667187690734863, "reward_std": 0.8295921683311462, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44484376907348633, "step": 5279 }, { "completion_length": 139.9375, "epoch": 2.825040128410915, "grad_norm": 0.8322550058364868, "kl": 0.2320030927658081, "learning_rate": 5.163096110912369e-08, "loss": 0.0093, "reward": 1.9296875, "reward_std": 0.9164282083511353, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 5280 }, { "completion_length": 110.6875, "epoch": 2.8255751738897805, "grad_norm": 2.6458444595336914, "kl": 0.2643333673477173, "learning_rate": 5.131674142288684e-08, "loss": 0.0106, "reward": 2.5038750171661377, "reward_std": 0.7282078266143799, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4882499873638153, "step": 5281 }, { "completion_length": 133.96875, "epoch": 2.8261102193686463, "grad_norm": 0.8143540024757385, "kl": 0.19371087849140167, "learning_rate": 5.1003470893204456e-08, "loss": 0.0077, "reward": 1.4073437452316284, "reward_std": 0.3797375559806824, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4854687452316284, "step": 5282 }, { "completion_length": 153.65625, "epoch": 2.826645264847512, "grad_norm": 1.162943720817566, "kl": 0.17355653643608093, "learning_rate": 5.0691149641506065e-08, "loss": 0.0069, "reward": 2.145625114440918, "reward_std": 1.0683717727661133, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4424999952316284, "step": 5283 }, { "completion_length": 102.25, "epoch": 2.8271803103263777, "grad_norm": 1.311193585395813, "kl": 0.21787647902965546, "learning_rate": 5.037977778885317e-08, "loss": 0.0087, "reward": 3.2133126258850098, "reward_std": 0.5193312168121338, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4945625066757202, "step": 5284 }, { "completion_length": 125.03125, "epoch": 2.8277153558052435, "grad_norm": 1.4151009321212769, "kl": 0.2290123552083969, "learning_rate": 5.0069355455939215e-08, "loss": 0.0092, "reward": 2.36328125, "reward_std": 0.5690770745277405, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47265625, "step": 5285 }, { "completion_length": 145.875, "epoch": 2.828250401284109, "grad_norm": 0.7760235071182251, "kl": 0.17930728197097778, "learning_rate": 4.975988276309046e-08, "loss": 0.0072, "reward": 1.6888749599456787, "reward_std": 0.6991694569587708, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4701249897480011, "step": 5286 }, { "completion_length": 115.125, "epoch": 2.828785446762975, "grad_norm": 0.811809778213501, "kl": 0.19329318404197693, "learning_rate": 4.9451359830264e-08, "loss": 0.0077, "reward": 1.8795000314712524, "reward_std": 0.6242221593856812, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48887500166893005, "step": 5287 }, { "completion_length": 117.46875, "epoch": 2.8293204922418407, "grad_norm": 1.1492286920547485, "kl": 0.18873611092567444, "learning_rate": 4.914378677704945e-08, "loss": 0.0075, "reward": 1.8841874599456787, "reward_std": 0.24499526619911194, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4779375195503235, "step": 5288 }, { "completion_length": 108.90625, "epoch": 2.8298555377207064, "grad_norm": 1.0064589977264404, "kl": 0.1903294026851654, "learning_rate": 4.8837163722668114e-08, "loss": 0.0076, "reward": 2.1204686164855957, "reward_std": 0.43527793884277344, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49546873569488525, "step": 5289 }, { "completion_length": 128.46875, "epoch": 2.8303905831995717, "grad_norm": 0.7240495681762695, "kl": 0.21997351944446564, "learning_rate": 4.853149078597241e-08, "loss": 0.0088, "reward": 2.3513126373291016, "reward_std": 0.7840808033943176, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4763124883174896, "step": 5290 }, { "completion_length": 113.125, "epoch": 2.830925628678438, "grad_norm": 0.8592776656150818, "kl": 0.1975671797990799, "learning_rate": 4.822676808544785e-08, "loss": 0.0079, "reward": 2.312375068664551, "reward_std": 0.32715150713920593, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 5291 }, { "completion_length": 121.4375, "epoch": 2.831460674157303, "grad_norm": 1.0644205808639526, "kl": 0.23591428995132446, "learning_rate": 4.792299573921022e-08, "loss": 0.0094, "reward": 1.7599375247955322, "reward_std": 0.7909438014030457, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49431249499320984, "step": 5292 }, { "completion_length": 143.84375, "epoch": 2.831995719636169, "grad_norm": 0.8009401559829712, "kl": 0.15020456910133362, "learning_rate": 4.762017386500756e-08, "loss": 0.006, "reward": 1.6714999675750732, "reward_std": 0.7378925085067749, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.45274999737739563, "step": 5293 }, { "completion_length": 133.53125, "epoch": 2.8325307651150347, "grad_norm": 1.1631429195404053, "kl": 0.19266903400421143, "learning_rate": 4.731830258021958e-08, "loss": 0.0077, "reward": 1.8868436813354492, "reward_std": 0.7183631062507629, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.449343740940094, "step": 5294 }, { "completion_length": 113.0, "epoch": 2.8330658105939004, "grad_norm": 0.6802346706390381, "kl": 0.1931697130203247, "learning_rate": 4.7017382001857126e-08, "loss": 0.0077, "reward": 2.8125, "reward_std": 0.8092666864395142, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 5295 }, { "completion_length": 137.78125, "epoch": 2.833600856072766, "grad_norm": 0.7624483704566956, "kl": 0.1518547534942627, "learning_rate": 4.671741224656301e-08, "loss": 0.0061, "reward": 2.2417500019073486, "reward_std": 0.7084592580795288, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47612500190734863, "step": 5296 }, { "completion_length": 117.875, "epoch": 2.834135901551632, "grad_norm": 1.124240517616272, "kl": 0.1897401362657547, "learning_rate": 4.641839343061144e-08, "loss": 0.0076, "reward": 1.7482187747955322, "reward_std": 0.3327561616897583, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48259374499320984, "step": 5297 }, { "completion_length": 127.125, "epoch": 2.8346709470304976, "grad_norm": 1.625559687614441, "kl": 0.17902231216430664, "learning_rate": 4.6120325669907485e-08, "loss": 0.0072, "reward": 1.4755938053131104, "reward_std": 0.7538501024246216, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4599687457084656, "step": 5298 }, { "completion_length": 119.34375, "epoch": 2.8352059925093633, "grad_norm": 2.9853596687316895, "kl": 0.2605074346065521, "learning_rate": 4.582320907998816e-08, "loss": 0.0104, "reward": 1.9183125495910645, "reward_std": 0.842278778553009, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4808124899864197, "step": 5299 }, { "completion_length": 128.9375, "epoch": 2.835741037988229, "grad_norm": 1.2473692893981934, "kl": 0.16903632879257202, "learning_rate": 4.552704377602135e-08, "loss": 0.0068, "reward": 2.16015625, "reward_std": 0.6271954774856567, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 5300 }, { "completion_length": 118.15625, "epoch": 2.836276083467095, "grad_norm": 0.9375525116920471, "kl": 0.16136230528354645, "learning_rate": 4.523182987280633e-08, "loss": 0.0065, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 5301 }, { "completion_length": 155.875, "epoch": 2.8368111289459605, "grad_norm": 0.9200294017791748, "kl": 0.14945420622825623, "learning_rate": 4.493756748477407e-08, "loss": 0.006, "reward": 1.2953437566757202, "reward_std": 0.7692956924438477, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4203437566757202, "step": 5302 }, { "completion_length": 142.53125, "epoch": 2.837346174424826, "grad_norm": 1.5786949396133423, "kl": 0.3026783764362335, "learning_rate": 4.4644256725985826e-08, "loss": 0.0121, "reward": 1.6268436908721924, "reward_std": 0.8434776663780212, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45496875047683716, "step": 5303 }, { "completion_length": 125.375, "epoch": 2.837881219903692, "grad_norm": 13835.52734375, "kl": 73.19841766357422, "learning_rate": 4.4351897710135094e-08, "loss": 2.9279, "reward": 2.132937431335449, "reward_std": 0.11748087406158447, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.398562490940094, "step": 5304 }, { "completion_length": 123.40625, "epoch": 2.8384162653825573, "grad_norm": 3.357532262802124, "kl": 0.19073139131069183, "learning_rate": 4.406049055054512e-08, "loss": 0.0076, "reward": 2.2126874923706055, "reward_std": 0.6239534616470337, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.46268749237060547, "step": 5305 }, { "completion_length": 113.28125, "epoch": 2.8389513108614235, "grad_norm": 1.0816181898117065, "kl": 0.2587814927101135, "learning_rate": 4.3770035360171104e-08, "loss": 0.0104, "reward": 2.794875144958496, "reward_std": 0.5219444632530212, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48237499594688416, "step": 5306 }, { "completion_length": 128.0, "epoch": 2.839486356340289, "grad_norm": 0.47150182723999023, "kl": 0.12812986969947815, "learning_rate": 4.348053225159965e-08, "loss": 0.0051, "reward": 1.6060937643051147, "reward_std": 0.42113733291625977, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48109376430511475, "step": 5307 }, { "completion_length": 126.96875, "epoch": 2.8400214018191545, "grad_norm": 0.8921065330505371, "kl": 0.20448151230812073, "learning_rate": 4.319198133704683e-08, "loss": 0.0082, "reward": 2.51953125, "reward_std": 0.7969306707382202, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47265625, "step": 5308 }, { "completion_length": 115.15625, "epoch": 2.8405564472980203, "grad_norm": 4.202129364013672, "kl": 0.18537503480911255, "learning_rate": 4.290438272836095e-08, "loss": 0.0074, "reward": 2.729750156402588, "reward_std": 0.7720180749893188, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47975000739097595, "step": 5309 }, { "completion_length": 129.125, "epoch": 2.841091492776886, "grad_norm": 1.0007938146591187, "kl": 0.19448181986808777, "learning_rate": 4.2617736537020894e-08, "loss": 0.0078, "reward": 2.074312448501587, "reward_std": 0.41724804043769836, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4805625081062317, "step": 5310 }, { "completion_length": 127.46875, "epoch": 2.8416265382557517, "grad_norm": 0.6807436347007751, "kl": 0.16723451018333435, "learning_rate": 4.2332042874135836e-08, "loss": 0.0067, "reward": 1.645218849182129, "reward_std": 0.7278447151184082, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44209372997283936, "step": 5311 }, { "completion_length": 123.34375, "epoch": 2.8421615837346175, "grad_norm": 0.9989736080169678, "kl": 0.20183107256889343, "learning_rate": 4.2047301850446364e-08, "loss": 0.0081, "reward": 2.28334379196167, "reward_std": 0.5827420353889465, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47084376215934753, "step": 5312 }, { "completion_length": 142.0625, "epoch": 2.842696629213483, "grad_norm": 1.4107519388198853, "kl": 0.3108995854854584, "learning_rate": 4.1763513576323636e-08, "loss": 0.0124, "reward": 1.4410624504089355, "reward_std": 0.7973449230194092, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4410625100135803, "step": 5313 }, { "completion_length": 129.28125, "epoch": 2.843231674692349, "grad_norm": 1.005761981010437, "kl": 0.20311908423900604, "learning_rate": 4.1480678161769094e-08, "loss": 0.0081, "reward": 2.041781425476074, "reward_std": 0.8767143487930298, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4636562764644623, "step": 5314 }, { "completion_length": 121.40625, "epoch": 2.8437667201712147, "grad_norm": 1.054537057876587, "kl": 0.19197124242782593, "learning_rate": 4.1198795716415326e-08, "loss": 0.0077, "reward": 2.7211248874664307, "reward_std": 0.8678736090660095, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4711250066757202, "step": 5315 }, { "completion_length": 140.1875, "epoch": 2.8443017656500804, "grad_norm": 0.42945432662963867, "kl": 0.1333787441253662, "learning_rate": 4.091786634952549e-08, "loss": 0.0053, "reward": 1.3491562604904175, "reward_std": 0.255262553691864, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4585312604904175, "step": 5316 }, { "completion_length": 130.53125, "epoch": 2.844836811128946, "grad_norm": 0.6490888595581055, "kl": 0.17944560945034027, "learning_rate": 4.063789016999331e-08, "loss": 0.0072, "reward": 2.063687562942505, "reward_std": 0.758840799331665, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4699375033378601, "step": 5317 }, { "completion_length": 140.15625, "epoch": 2.8453718566078114, "grad_norm": 1.1168320178985596, "kl": 0.1967804729938507, "learning_rate": 4.035886728634225e-08, "loss": 0.0079, "reward": 1.816812515258789, "reward_std": 0.9010457992553711, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48868751525878906, "step": 5318 }, { "completion_length": 149.03125, "epoch": 2.8459069020866776, "grad_norm": 1.0877423286437988, "kl": 0.2012765109539032, "learning_rate": 4.008079780672774e-08, "loss": 0.0081, "reward": 1.5343437194824219, "reward_std": 0.3691214919090271, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.40934374928474426, "step": 5319 }, { "completion_length": 139.0625, "epoch": 2.846441947565543, "grad_norm": 0.9692879319190979, "kl": 0.15488126873970032, "learning_rate": 3.9803681838934404e-08, "loss": 0.0062, "reward": 1.5948125123977661, "reward_std": 0.7026493549346924, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4073125123977661, "step": 5320 }, { "completion_length": 141.78125, "epoch": 2.8469769930444087, "grad_norm": 1.0728727579116821, "kl": 0.2197403609752655, "learning_rate": 3.952751949037742e-08, "loss": 0.0088, "reward": 1.825812578201294, "reward_std": 0.35704076290130615, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45081251859664917, "step": 5321 }, { "completion_length": 120.21875, "epoch": 2.8475120385232744, "grad_norm": 1.0049901008605957, "kl": 0.16163258254528046, "learning_rate": 3.925231086810338e-08, "loss": 0.0065, "reward": 2.084843635559082, "reward_std": 1.0217936038970947, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4910937547683716, "step": 5322 }, { "completion_length": 124.4375, "epoch": 2.84804708400214, "grad_norm": 1.1839349269866943, "kl": 0.1705305427312851, "learning_rate": 3.897805607878807e-08, "loss": 0.0068, "reward": 2.3125, "reward_std": 0.8533962965011597, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46875, "step": 5323 }, { "completion_length": 170.03125, "epoch": 2.848582129481006, "grad_norm": 1.2261102199554443, "kl": 0.12796065211296082, "learning_rate": 3.870475522873729e-08, "loss": 0.0051, "reward": 1.5313125848770142, "reward_std": 0.9946655035018921, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.375062495470047, "step": 5324 }, { "completion_length": 135.09375, "epoch": 2.8491171749598716, "grad_norm": 0.7808442115783691, "kl": 0.20514503121376038, "learning_rate": 3.843240842388879e-08, "loss": 0.0082, "reward": 1.350406289100647, "reward_std": 0.22540543973445892, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4754062592983246, "step": 5325 }, { "completion_length": 133.1875, "epoch": 2.8496522204387373, "grad_norm": 0.5729094743728638, "kl": 0.13652126491069794, "learning_rate": 3.8161015769808684e-08, "loss": 0.0055, "reward": 1.7409687042236328, "reward_std": 1.0402214527130127, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4440937638282776, "step": 5326 }, { "completion_length": 130.53125, "epoch": 2.850187265917603, "grad_norm": 1.575927495956421, "kl": 0.2089923918247223, "learning_rate": 3.7890577371694216e-08, "loss": 0.0084, "reward": 1.904343843460083, "reward_std": 0.7028074860572815, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41996872425079346, "step": 5327 }, { "completion_length": 136.5625, "epoch": 2.850722311396469, "grad_norm": 1.6734064817428589, "kl": 0.1582697033882141, "learning_rate": 3.762109333437208e-08, "loss": 0.0063, "reward": 1.9728437662124634, "reward_std": 0.7157760262489319, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4259687662124634, "step": 5328 }, { "completion_length": 134.71875, "epoch": 2.8512573568753345, "grad_norm": 1.035804271697998, "kl": 0.2707524001598358, "learning_rate": 3.735256376230012e-08, "loss": 0.0108, "reward": 1.7710624933242798, "reward_std": 0.5057854652404785, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4585624933242798, "step": 5329 }, { "completion_length": 104.3125, "epoch": 2.8517924023542003, "grad_norm": 324.79083251953125, "kl": 35.08738708496094, "learning_rate": 3.708498875956506e-08, "loss": 1.4035, "reward": 2.4023125171661377, "reward_std": 0.7387837171554565, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4960625171661377, "step": 5330 }, { "completion_length": 148.0, "epoch": 2.8523274478330656, "grad_norm": 2.237314224243164, "kl": 0.1954718828201294, "learning_rate": 3.681836842988423e-08, "loss": 0.0078, "reward": 1.9879686832427979, "reward_std": 0.9130182266235352, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4410937428474426, "step": 5331 }, { "completion_length": 116.75, "epoch": 2.8528624933119318, "grad_norm": 0.4828655421733856, "kl": 0.2037166953086853, "learning_rate": 3.655270287660495e-08, "loss": 0.0081, "reward": 2.282562494277954, "reward_std": 0.34887269139289856, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4856874942779541, "step": 5332 }, { "completion_length": 104.71875, "epoch": 2.853397538790797, "grad_norm": 0.6077759265899658, "kl": 0.2915610671043396, "learning_rate": 3.6287992202704015e-08, "loss": 0.0117, "reward": 2.3439061641693115, "reward_std": 0.4936973750591278, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4689062535762787, "step": 5333 }, { "completion_length": 132.15625, "epoch": 2.853932584269663, "grad_norm": 1.3051834106445312, "kl": 0.246374249458313, "learning_rate": 3.602423651078824e-08, "loss": 0.0099, "reward": 1.959625005722046, "reward_std": 1.0128631591796875, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4440000057220459, "step": 5334 }, { "completion_length": 131.875, "epoch": 2.8544676297485285, "grad_norm": 1.281446099281311, "kl": 0.18408004939556122, "learning_rate": 3.5761435903094996e-08, "loss": 0.0074, "reward": 1.809499979019165, "reward_std": 0.8191444873809814, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4032500088214874, "step": 5335 }, { "completion_length": 126.34375, "epoch": 2.8550026752273943, "grad_norm": 2.651001453399658, "kl": 0.40000176429748535, "learning_rate": 3.549959048149032e-08, "loss": 0.016, "reward": 1.6409063339233398, "reward_std": 0.48804017901420593, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45340627431869507, "step": 5336 }, { "completion_length": 120.09375, "epoch": 2.85553772070626, "grad_norm": 0.817825198173523, "kl": 0.21070724725723267, "learning_rate": 3.523870034747051e-08, "loss": 0.0084, "reward": 1.908250093460083, "reward_std": 0.8861656785011292, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45512500405311584, "step": 5337 }, { "completion_length": 127.53125, "epoch": 2.8560727661851257, "grad_norm": 0.6330524682998657, "kl": 0.17215120792388916, "learning_rate": 3.49787656021619e-08, "loss": 0.0069, "reward": 2.448625087738037, "reward_std": 0.7021993398666382, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47987499833106995, "step": 5338 }, { "completion_length": 128.90625, "epoch": 2.8566078116639915, "grad_norm": 0.877845287322998, "kl": 0.21235023438930511, "learning_rate": 3.471978634631973e-08, "loss": 0.0085, "reward": 2.1450936794281006, "reward_std": 0.48717033863067627, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48884373903274536, "step": 5339 }, { "completion_length": 116.90625, "epoch": 2.857142857142857, "grad_norm": 1.266168236732483, "kl": 0.1887354552745819, "learning_rate": 3.44617626803298e-08, "loss": 0.0075, "reward": 2.620281219482422, "reward_std": 1.0620248317718506, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46403124928474426, "step": 5340 }, { "completion_length": 116.875, "epoch": 2.857677902621723, "grad_norm": 2.275472640991211, "kl": 0.21108882129192352, "learning_rate": 3.420469470420684e-08, "loss": 0.0084, "reward": 2.3531250953674316, "reward_std": 0.9743537902832031, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4781250059604645, "step": 5341 }, { "completion_length": 135.53125, "epoch": 2.8582129481005887, "grad_norm": 0.7082862257957458, "kl": 0.17376111447811127, "learning_rate": 3.394858251759503e-08, "loss": 0.007, "reward": 1.9866561889648438, "reward_std": 0.695465087890625, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4085312485694885, "step": 5342 }, { "completion_length": 112.6875, "epoch": 2.8587479935794544, "grad_norm": 269696608.0, "kl": 163363.015625, "learning_rate": 3.3693426219768845e-08, "loss": 6534.521, "reward": 2.2995312213897705, "reward_std": 0.41369956731796265, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4714062511920929, "step": 5343 }, { "completion_length": 140.4375, "epoch": 2.8592830390583197, "grad_norm": 0.5103247165679932, "kl": 0.14377176761627197, "learning_rate": 3.343922590963167e-08, "loss": 0.0058, "reward": 1.7070624828338623, "reward_std": 0.6264945268630981, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4570624828338623, "step": 5344 }, { "completion_length": 141.03125, "epoch": 2.859818084537186, "grad_norm": 1.342641830444336, "kl": 0.1949666440486908, "learning_rate": 3.318598168571607e-08, "loss": 0.0078, "reward": 1.7882499694824219, "reward_std": 0.8471451997756958, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46012499928474426, "step": 5345 }, { "completion_length": 141.1875, "epoch": 2.860353130016051, "grad_norm": 1.8654282093048096, "kl": 0.1924196034669876, "learning_rate": 3.293369364618465e-08, "loss": 0.0077, "reward": 1.9877500534057617, "reward_std": 1.074826717376709, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.42524999380111694, "step": 5346 }, { "completion_length": 149.125, "epoch": 2.860888175494917, "grad_norm": 204391808.0, "kl": 1379442.375, "learning_rate": 3.2682361888828895e-08, "loss": 55177.6914, "reward": 1.9729374647140503, "reward_std": 0.5086597800254822, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4573124945163727, "step": 5347 }, { "completion_length": 154.59375, "epoch": 2.8614232209737827, "grad_norm": 1.3936396837234497, "kl": 0.17529848217964172, "learning_rate": 3.2431986511070045e-08, "loss": 0.007, "reward": 1.3909375667572021, "reward_std": 0.8936692476272583, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4065625071525574, "step": 5348 }, { "completion_length": 136.03125, "epoch": 2.8619582664526484, "grad_norm": 1.3180887699127197, "kl": 0.172896608710289, "learning_rate": 3.218256760995825e-08, "loss": 0.0069, "reward": 2.3945000171661377, "reward_std": 0.9800341725349426, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4569999873638153, "step": 5349 }, { "completion_length": 129.375, "epoch": 2.862493311931514, "grad_norm": 1.0350496768951416, "kl": 0.19320139288902283, "learning_rate": 3.193410528217311e-08, "loss": 0.0077, "reward": 3.109250068664551, "reward_std": 0.6200621128082275, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 5350 }, { "completion_length": 108.5, "epoch": 2.86302835741038, "grad_norm": 1.7405471801757812, "kl": 0.23844891786575317, "learning_rate": 3.168659962402343e-08, "loss": 0.0095, "reward": 2.755906105041504, "reward_std": 0.971050500869751, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47465625405311584, "step": 5351 }, { "completion_length": 121.75, "epoch": 2.8635634028892456, "grad_norm": 1.6113332509994507, "kl": 0.22675544023513794, "learning_rate": 3.1440050731446624e-08, "loss": 0.0091, "reward": 3.0766875743865967, "reward_std": 0.6700991988182068, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4985625147819519, "step": 5352 }, { "completion_length": 106.625, "epoch": 2.8640984483681113, "grad_norm": 0.46941009163856506, "kl": 0.20772108435630798, "learning_rate": 3.119445870001098e-08, "loss": 0.0083, "reward": 2.8212499618530273, "reward_std": 0.4348289370536804, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.49312499165534973, "step": 5353 }, { "completion_length": 129.34375, "epoch": 2.864633493846977, "grad_norm": 4.072901725769043, "kl": 0.2503618597984314, "learning_rate": 3.0949823624911165e-08, "loss": 0.01, "reward": 1.8074374198913574, "reward_std": 0.7270692586898804, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4793124794960022, "step": 5354 }, { "completion_length": 119.65625, "epoch": 2.865168539325843, "grad_norm": 0.7168227434158325, "kl": 0.14126567542552948, "learning_rate": 3.070614560097357e-08, "loss": 0.0057, "reward": 2.406125068664551, "reward_std": 0.6529779434204102, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 5355 }, { "completion_length": 141.46875, "epoch": 2.8657035848047085, "grad_norm": 0.6084322929382324, "kl": 0.13460244238376617, "learning_rate": 3.046342472265207e-08, "loss": 0.0054, "reward": 1.8172500133514404, "reward_std": 0.7795020341873169, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44225001335144043, "step": 5356 }, { "completion_length": 125.9375, "epoch": 2.866238630283574, "grad_norm": 0.7518742680549622, "kl": 0.22443042695522308, "learning_rate": 3.0221661084029743e-08, "loss": 0.009, "reward": 2.2080936431884766, "reward_std": 0.7539709806442261, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4737187623977661, "step": 5357 }, { "completion_length": 117.0, "epoch": 2.86677367576244, "grad_norm": 3.408531427383423, "kl": 0.3872414529323578, "learning_rate": 2.998085477881912e-08, "loss": 0.0155, "reward": 2.504218578338623, "reward_std": 0.9366284608840942, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4573437571525574, "step": 5358 }, { "completion_length": 128.84375, "epoch": 2.8673087212413053, "grad_norm": 742.3353881835938, "kl": 117.38546752929688, "learning_rate": 2.974100590036111e-08, "loss": 4.6954, "reward": 1.7887499332427979, "reward_std": 0.7001120448112488, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4762499928474426, "step": 5359 }, { "completion_length": 121.5625, "epoch": 2.867843766720171, "grad_norm": 1.8885862827301025, "kl": 0.247654989361763, "learning_rate": 2.9502114541626048e-08, "loss": 0.0099, "reward": 2.603562593460083, "reward_std": 0.6440777778625488, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46293747425079346, "step": 5360 }, { "completion_length": 110.4375, "epoch": 2.868378812199037, "grad_norm": 1.1157495975494385, "kl": 0.21196886897087097, "learning_rate": 2.9264180795212373e-08, "loss": 0.0085, "reward": 2.2055001258850098, "reward_std": 0.4191952049732208, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4867500066757202, "step": 5361 }, { "completion_length": 140.28125, "epoch": 2.8689138576779025, "grad_norm": 1.3891948461532593, "kl": 0.18920564651489258, "learning_rate": 2.902720475334797e-08, "loss": 0.0076, "reward": 1.7129062414169312, "reward_std": 0.5301632881164551, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44728127121925354, "step": 5362 }, { "completion_length": 138.1875, "epoch": 2.8694489031567683, "grad_norm": 2.2965660095214844, "kl": 0.19909662008285522, "learning_rate": 2.879118650788937e-08, "loss": 0.008, "reward": 1.7265625, "reward_std": 0.5166469812393188, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4765625, "step": 5363 }, { "completion_length": 142.28125, "epoch": 2.869983948635634, "grad_norm": 4.877506256103516, "kl": 0.31729814410209656, "learning_rate": 2.8556126150321718e-08, "loss": 0.0127, "reward": 1.515625, "reward_std": 0.5919738411903381, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.453125, "step": 5364 }, { "completion_length": 125.0625, "epoch": 2.8705189941144997, "grad_norm": 0.5761368870735168, "kl": 0.14892694354057312, "learning_rate": 2.83220237717588e-08, "loss": 0.006, "reward": 2.6718125343322754, "reward_std": 1.0391860008239746, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 5365 }, { "completion_length": 118.8125, "epoch": 2.8710540395933655, "grad_norm": 1.8560787439346313, "kl": 0.24102270603179932, "learning_rate": 2.808887946294331e-08, "loss": 0.0096, "reward": 2.1513123512268066, "reward_std": 0.9940700531005859, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4638125002384186, "step": 5366 }, { "completion_length": 131.9375, "epoch": 2.871589085072231, "grad_norm": 0.5873785018920898, "kl": 0.1635696440935135, "learning_rate": 2.7856693314246286e-08, "loss": 0.0065, "reward": 2.676187515258789, "reward_std": 0.9028197526931763, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47306251525878906, "step": 5367 }, { "completion_length": 123.53125, "epoch": 2.872124130551097, "grad_norm": 14.892072677612305, "kl": 1.172065019607544, "learning_rate": 2.7625465415667962e-08, "loss": 0.0469, "reward": 1.8997187614440918, "reward_std": 0.6942830085754395, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4934687614440918, "step": 5368 }, { "completion_length": 120.84375, "epoch": 2.8726591760299627, "grad_norm": 0.7488623857498169, "kl": 0.17439015209674835, "learning_rate": 2.7395195856836365e-08, "loss": 0.007, "reward": 2.8433749675750732, "reward_std": 0.3021507263183594, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49962499737739563, "step": 5369 }, { "completion_length": 138.21875, "epoch": 2.8731942215088284, "grad_norm": 1.9452983140945435, "kl": 0.20926687121391296, "learning_rate": 2.7165884727008153e-08, "loss": 0.0084, "reward": 1.8254687786102295, "reward_std": 0.5602195858955383, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4348437786102295, "step": 5370 }, { "completion_length": 143.8125, "epoch": 2.873729266987694, "grad_norm": 1.5779542922973633, "kl": 0.2074439972639084, "learning_rate": 2.6937532115069164e-08, "loss": 0.0083, "reward": 2.012406349182129, "reward_std": 0.916504979133606, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.40303122997283936, "step": 5371 }, { "completion_length": 124.5, "epoch": 2.8742643124665594, "grad_norm": 0.6307433843612671, "kl": 0.16310471296310425, "learning_rate": 2.6710138109533046e-08, "loss": 0.0065, "reward": 2.8448123931884766, "reward_std": 0.9295039772987366, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4698125123977661, "step": 5372 }, { "completion_length": 125.0625, "epoch": 2.8747993579454256, "grad_norm": 6.122673511505127, "kl": 0.8530607223510742, "learning_rate": 2.6483702798542066e-08, "loss": 0.0341, "reward": 2.975062370300293, "reward_std": 0.9119141101837158, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4906874895095825, "step": 5373 }, { "completion_length": 135.875, "epoch": 2.875334403424291, "grad_norm": 1.415575385093689, "kl": 0.2798727750778198, "learning_rate": 2.625822626986685e-08, "loss": 0.0112, "reward": 1.9122188091278076, "reward_std": 0.9874943494796753, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47471874952316284, "step": 5374 }, { "completion_length": 142.21875, "epoch": 2.8758694489031567, "grad_norm": 0.6412013173103333, "kl": 0.19189900159835815, "learning_rate": 2.6033708610906925e-08, "loss": 0.0077, "reward": 1.3305312395095825, "reward_std": 0.5197685360908508, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4555312395095825, "step": 5375 }, { "completion_length": 112.65625, "epoch": 2.8764044943820224, "grad_norm": 0.5031482577323914, "kl": 0.1752219945192337, "learning_rate": 2.58101499086888e-08, "loss": 0.007, "reward": 2.1377501487731934, "reward_std": 0.8773393630981445, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4814999997615814, "step": 5376 }, { "completion_length": 123.3125, "epoch": 2.876939539860888, "grad_norm": 0.8670672178268433, "kl": 0.1598740518093109, "learning_rate": 2.558755024986842e-08, "loss": 0.0064, "reward": 2.5444374084472656, "reward_std": 0.6830737590789795, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4819374978542328, "step": 5377 }, { "completion_length": 140.03125, "epoch": 2.877474585339754, "grad_norm": 2.2608675956726074, "kl": 0.22995543479919434, "learning_rate": 2.536590972073011e-08, "loss": 0.0092, "reward": 2.307093858718872, "reward_std": 1.2025339603424072, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4320937693119049, "step": 5378 }, { "completion_length": 154.125, "epoch": 2.8780096308186196, "grad_norm": 1.3795166015625, "kl": 0.14041337370872498, "learning_rate": 2.5145228407185418e-08, "loss": 0.0056, "reward": 1.5648750066757202, "reward_std": 0.8574076890945435, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4242500066757202, "step": 5379 }, { "completion_length": 129.625, "epoch": 2.8785446762974853, "grad_norm": 1.193124771118164, "kl": 0.3102407157421112, "learning_rate": 2.4925506394774812e-08, "loss": 0.0124, "reward": 1.9551249742507935, "reward_std": 0.6479001045227051, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47075000405311584, "step": 5380 }, { "completion_length": 141.75, "epoch": 2.879079721776351, "grad_norm": 1.0480163097381592, "kl": 0.19527773559093475, "learning_rate": 2.4706743768667108e-08, "loss": 0.0078, "reward": 1.6504688262939453, "reward_std": 0.6583330631256104, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47859376668930054, "step": 5381 }, { "completion_length": 143.875, "epoch": 2.879614767255217, "grad_norm": 1.206662654876709, "kl": 0.23074112832546234, "learning_rate": 2.4488940613658363e-08, "loss": 0.0092, "reward": 1.071062445640564, "reward_std": 0.587806224822998, "rewards/correctness_reward_func": 0.0625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.38356250524520874, "step": 5382 }, { "completion_length": 126.5, "epoch": 2.8801498127340825, "grad_norm": 2.3707056045532227, "kl": 0.2241324782371521, "learning_rate": 2.4272097014173546e-08, "loss": 0.009, "reward": 2.6250624656677246, "reward_std": 0.6714376211166382, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.453187495470047, "step": 5383 }, { "completion_length": 120.59375, "epoch": 2.8806848582129483, "grad_norm": 0.9944695234298706, "kl": 0.2682332992553711, "learning_rate": 2.405621305426542e-08, "loss": 0.0107, "reward": 1.9207812547683716, "reward_std": 0.8263342380523682, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4364062547683716, "step": 5384 }, { "completion_length": 111.875, "epoch": 2.8812199036918136, "grad_norm": 0.7720736265182495, "kl": 0.19153419137001038, "learning_rate": 2.384128881761455e-08, "loss": 0.0077, "reward": 2.373718738555908, "reward_std": 0.4436507821083069, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4830937385559082, "step": 5385 }, { "completion_length": 129.6875, "epoch": 2.8817549491706798, "grad_norm": 5.5335845947265625, "kl": 1.741117000579834, "learning_rate": 2.362732438752985e-08, "loss": 0.0696, "reward": 1.9831249713897705, "reward_std": 0.8872338533401489, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4518750011920929, "step": 5386 }, { "completion_length": 142.8125, "epoch": 2.882289994649545, "grad_norm": 1.1485953330993652, "kl": 0.1786363571882248, "learning_rate": 2.3414319846948032e-08, "loss": 0.0071, "reward": 1.9411875009536743, "reward_std": 0.5848727226257324, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4724375009536743, "step": 5387 }, { "completion_length": 135.40625, "epoch": 2.882825040128411, "grad_norm": 0.8511391878128052, "kl": 0.21462325751781464, "learning_rate": 2.3202275278433884e-08, "loss": 0.0086, "reward": 2.34375, "reward_std": 0.2346404492855072, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 5388 }, { "completion_length": 135.84375, "epoch": 2.8833600856072765, "grad_norm": 0.6200993657112122, "kl": 0.17998072504997253, "learning_rate": 2.299119076417944e-08, "loss": 0.0072, "reward": 2.1860313415527344, "reward_std": 0.4971085488796234, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4829062521457672, "step": 5389 }, { "completion_length": 115.6875, "epoch": 2.8838951310861423, "grad_norm": 1.1098010540008545, "kl": 0.22088778018951416, "learning_rate": 2.2781066386005357e-08, "loss": 0.0088, "reward": 2.8926563262939453, "reward_std": 1.0178415775299072, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47078123688697815, "step": 5390 }, { "completion_length": 137.84375, "epoch": 2.884430176565008, "grad_norm": 1.0151301622390747, "kl": 0.16986265778541565, "learning_rate": 2.2571902225360375e-08, "loss": 0.0068, "reward": 1.90234375, "reward_std": 0.9405554533004761, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46484375, "step": 5391 }, { "completion_length": 115.09375, "epoch": 2.8849652220438737, "grad_norm": 2.0419740676879883, "kl": 0.17125552892684937, "learning_rate": 2.2363698363319643e-08, "loss": 0.0069, "reward": 2.6959376335144043, "reward_std": 0.6617028117179871, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49281251430511475, "step": 5392 }, { "completion_length": 136.90625, "epoch": 2.8855002675227395, "grad_norm": 1.2316895723342896, "kl": 0.1674618124961853, "learning_rate": 2.2156454880587485e-08, "loss": 0.0067, "reward": 1.5887187719345093, "reward_std": 0.5870280861854553, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4324687421321869, "step": 5393 }, { "completion_length": 140.0, "epoch": 2.886035313001605, "grad_norm": 1.1513140201568604, "kl": 0.17101603746414185, "learning_rate": 2.1950171857495485e-08, "loss": 0.0068, "reward": 1.2672812938690186, "reward_std": 0.302482545375824, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4704062342643738, "step": 5394 }, { "completion_length": 105.46875, "epoch": 2.886570358480471, "grad_norm": 0.8818657994270325, "kl": 0.21191158890724182, "learning_rate": 2.174484937400273e-08, "loss": 0.0085, "reward": 2.8043437004089355, "reward_std": 0.6578583717346191, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4762187600135803, "step": 5395 }, { "completion_length": 123.46875, "epoch": 2.8871054039593367, "grad_norm": 0.5411727428436279, "kl": 0.17806410789489746, "learning_rate": 2.1540487509696394e-08, "loss": 0.0071, "reward": 2.421875, "reward_std": 0.7481576204299927, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 5396 }, { "completion_length": 128.53125, "epoch": 2.8876404494382024, "grad_norm": 2.4543564319610596, "kl": 0.29557907581329346, "learning_rate": 2.1337086343790615e-08, "loss": 0.0118, "reward": 2.1781249046325684, "reward_std": 0.6297355890274048, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4437499940395355, "step": 5397 }, { "completion_length": 146.8125, "epoch": 2.8881754949170677, "grad_norm": 1.053711175918579, "kl": 0.2079484760761261, "learning_rate": 2.1134645955128152e-08, "loss": 0.0083, "reward": 1.6860313415527344, "reward_std": 0.7413942813873291, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4360312521457672, "step": 5398 }, { "completion_length": 162.34375, "epoch": 2.888710540395934, "grad_norm": 0.9549646377563477, "kl": 0.2016160488128662, "learning_rate": 2.093316642217791e-08, "loss": 0.0081, "reward": 1.5796562433242798, "reward_std": 1.164670467376709, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.3765312433242798, "step": 5399 }, { "completion_length": 132.65625, "epoch": 2.889245585874799, "grad_norm": 0.6326959133148193, "kl": 0.14698198437690735, "learning_rate": 2.0732647823038242e-08, "loss": 0.0059, "reward": 2.36328125, "reward_std": 1.0531107187271118, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 5400 }, { "completion_length": 121.625, "epoch": 2.889780631353665, "grad_norm": 1.0884064435958862, "kl": 0.17745953798294067, "learning_rate": 2.0533090235433372e-08, "loss": 0.0071, "reward": 1.984375, "reward_std": 0.6526868343353271, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 5401 }, { "completion_length": 137.875, "epoch": 2.8903156768325307, "grad_norm": 0.7171086072921753, "kl": 0.17019854485988617, "learning_rate": 2.0334493736715588e-08, "loss": 0.0068, "reward": 2.0409061908721924, "reward_std": 0.991710901260376, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.38465625047683716, "step": 5402 }, { "completion_length": 137.34375, "epoch": 2.8908507223113964, "grad_norm": 0.9507160782814026, "kl": 0.16550248861312866, "learning_rate": 2.0136858403865532e-08, "loss": 0.0066, "reward": 1.90625, "reward_std": 1.0508167743682861, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.453125, "step": 5403 }, { "completion_length": 135.65625, "epoch": 2.891385767790262, "grad_norm": 1.7908462285995483, "kl": 0.17911198735237122, "learning_rate": 1.994018431348915e-08, "loss": 0.0072, "reward": 2.4839999675750732, "reward_std": 1.3073344230651855, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45274999737739563, "step": 5404 }, { "completion_length": 143.96875, "epoch": 2.891920813269128, "grad_norm": 1.1871747970581055, "kl": 0.2285444736480713, "learning_rate": 1.974447154182213e-08, "loss": 0.0091, "reward": 1.7480000257492065, "reward_std": 1.0262151956558228, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.40424996614456177, "step": 5405 }, { "completion_length": 102.34375, "epoch": 2.8924558587479936, "grad_norm": 2.273019313812256, "kl": 0.3864699900150299, "learning_rate": 1.9549720164726004e-08, "loss": 0.0155, "reward": 2.1690001487731934, "reward_std": 0.8639104962348938, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4815000295639038, "step": 5406 }, { "completion_length": 124.09375, "epoch": 2.8929909042268593, "grad_norm": 1.2579432725906372, "kl": 0.2858901917934418, "learning_rate": 1.9355930257690113e-08, "loss": 0.0114, "reward": 2.345937728881836, "reward_std": 0.8847034573554993, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43968749046325684, "step": 5407 }, { "completion_length": 129.0, "epoch": 2.893525949705725, "grad_norm": 1.5598623752593994, "kl": 0.23536266386508942, "learning_rate": 1.9163101895831316e-08, "loss": 0.0094, "reward": 1.8558437824249268, "reward_std": 0.759148895740509, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44959375262260437, "step": 5408 }, { "completion_length": 149.0, "epoch": 2.894060995184591, "grad_norm": 35303743488.0, "kl": 259334000.0, "learning_rate": 1.8971235153893708e-08, "loss": 10373360.0, "reward": 1.6522188186645508, "reward_std": 0.7589374780654907, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.464718759059906, "step": 5409 }, { "completion_length": 131.0625, "epoch": 2.8945960406634565, "grad_norm": 2.6334962844848633, "kl": 0.46751919388771057, "learning_rate": 1.878033010624808e-08, "loss": 0.0187, "reward": 1.6573437452316284, "reward_std": 0.40579766035079956, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4854687452316284, "step": 5410 }, { "completion_length": 125.78125, "epoch": 2.895131086142322, "grad_norm": 1.012715220451355, "kl": 0.18101927638053894, "learning_rate": 1.8590386826893293e-08, "loss": 0.0072, "reward": 1.7952499389648438, "reward_std": 0.9547819495201111, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4983749985694885, "step": 5411 }, { "completion_length": 120.53125, "epoch": 2.895666131621188, "grad_norm": 1.0632244348526, "kl": 0.2142658531665802, "learning_rate": 1.8401405389455175e-08, "loss": 0.0086, "reward": 1.7926561832427979, "reward_std": 0.9651241302490234, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4801562428474426, "step": 5412 }, { "completion_length": 132.875, "epoch": 2.8962011771000533, "grad_norm": 0.712640643119812, "kl": 0.20700804889202118, "learning_rate": 1.8213385867185684e-08, "loss": 0.0083, "reward": 2.488875150680542, "reward_std": 0.9857811331748962, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47325003147125244, "step": 5413 }, { "completion_length": 107.5, "epoch": 2.896736222578919, "grad_norm": 1.45289945602417, "kl": 0.21717770397663116, "learning_rate": 1.802632833296569e-08, "loss": 0.0087, "reward": 2.78125, "reward_std": 0.6631065607070923, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 5414 }, { "completion_length": 141.65625, "epoch": 2.897271268057785, "grad_norm": 0.3121340274810791, "kl": 0.1242152526974678, "learning_rate": 1.784023285930192e-08, "loss": 0.005, "reward": 2.4526562690734863, "reward_std": 0.5830983519554138, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48390626907348633, "step": 5415 }, { "completion_length": 123.4375, "epoch": 2.8978063135366505, "grad_norm": 1.322589635848999, "kl": 0.16063204407691956, "learning_rate": 1.765509951832861e-08, "loss": 0.0064, "reward": 2.120093822479248, "reward_std": 0.5060592293739319, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47946876287460327, "step": 5416 }, { "completion_length": 130.875, "epoch": 2.8983413590155163, "grad_norm": 1.1882392168045044, "kl": 0.25856441259384155, "learning_rate": 1.7470928381806697e-08, "loss": 0.0103, "reward": 1.787406325340271, "reward_std": 0.767554521560669, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4592812657356262, "step": 5417 }, { "completion_length": 123.125, "epoch": 2.898876404494382, "grad_norm": 0.860924243927002, "kl": 0.1790909320116043, "learning_rate": 1.7287719521124903e-08, "loss": 0.0072, "reward": 2.383625030517578, "reward_std": 0.2561492621898651, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47737500071525574, "step": 5418 }, { "completion_length": 127.4375, "epoch": 2.8994114499732477, "grad_norm": 1.38906729221344, "kl": 0.20254364609718323, "learning_rate": 1.710547300729837e-08, "loss": 0.0081, "reward": 2.4579062461853027, "reward_std": 1.1505248546600342, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44228124618530273, "step": 5419 }, { "completion_length": 111.375, "epoch": 2.8999464954521135, "grad_norm": 2.4237122535705566, "kl": 0.315729022026062, "learning_rate": 1.692418891096892e-08, "loss": 0.0126, "reward": 2.106874942779541, "reward_std": 0.42097947001457214, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4818750023841858, "step": 5420 }, { "completion_length": 130.4375, "epoch": 2.900481540930979, "grad_norm": 0.6721550226211548, "kl": 0.1642170250415802, "learning_rate": 1.6743867302406457e-08, "loss": 0.0066, "reward": 2.5316250324249268, "reward_std": 0.8754053711891174, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46912500262260437, "step": 5421 }, { "completion_length": 130.21875, "epoch": 2.901016586409845, "grad_norm": 1.464870810508728, "kl": 0.3050759434700012, "learning_rate": 1.656450825150646e-08, "loss": 0.0122, "reward": 2.365875005722046, "reward_std": 0.6536208391189575, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4752500057220459, "step": 5422 }, { "completion_length": 130.625, "epoch": 2.9015516318887107, "grad_norm": 1.7905653715133667, "kl": 0.2824031710624695, "learning_rate": 1.638611182779193e-08, "loss": 0.0113, "reward": 2.6229376792907715, "reward_std": 0.8040982484817505, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45106250047683716, "step": 5423 }, { "completion_length": 127.46875, "epoch": 2.902086677367576, "grad_norm": 1.0677266120910645, "kl": 0.17739012837409973, "learning_rate": 1.6208678100413655e-08, "loss": 0.0071, "reward": 2.171562671661377, "reward_std": 0.9336273074150085, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4371874928474426, "step": 5424 }, { "completion_length": 142.09375, "epoch": 2.902621722846442, "grad_norm": 0.5879808068275452, "kl": 0.14699432253837585, "learning_rate": 1.6032207138147183e-08, "loss": 0.0059, "reward": 1.80859375, "reward_std": 0.3546842336654663, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.46484375, "step": 5425 }, { "completion_length": 132.03125, "epoch": 2.9031567683253074, "grad_norm": 1.2693843841552734, "kl": 0.16468334197998047, "learning_rate": 1.585669900939668e-08, "loss": 0.0066, "reward": 2.472531318664551, "reward_std": 0.7641353607177734, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.472531259059906, "step": 5426 }, { "completion_length": 139.6875, "epoch": 2.9036918138041736, "grad_norm": 23.83067512512207, "kl": 0.27183282375335693, "learning_rate": 1.5682153782192178e-08, "loss": 0.0109, "reward": 2.351437568664551, "reward_std": 0.849270224571228, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.476437509059906, "step": 5427 }, { "completion_length": 133.6875, "epoch": 2.904226859283039, "grad_norm": 69.14704895019531, "kl": 8.1078462600708, "learning_rate": 1.550857152419094e-08, "loss": 0.3243, "reward": 2.5415310859680176, "reward_std": 0.8291419744491577, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4477812647819519, "step": 5428 }, { "completion_length": 131.4375, "epoch": 2.9047619047619047, "grad_norm": 1.1389089822769165, "kl": 0.24354583024978638, "learning_rate": 1.5335952302676927e-08, "loss": 0.0097, "reward": 2.3209686279296875, "reward_std": 1.0372843742370605, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47721874713897705, "step": 5429 }, { "completion_length": 128.28125, "epoch": 2.9052969502407704, "grad_norm": 3.129473924636841, "kl": 0.16810953617095947, "learning_rate": 1.5164296184560222e-08, "loss": 0.0067, "reward": 1.72265625, "reward_std": 0.49290311336517334, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48828125, "step": 5430 }, { "completion_length": 147.46875, "epoch": 2.905831995719636, "grad_norm": 1.8270572423934937, "kl": 0.1744629144668579, "learning_rate": 1.499360323637844e-08, "loss": 0.007, "reward": 1.6100000143051147, "reward_std": 0.79468834400177, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.37562498450279236, "step": 5431 }, { "completion_length": 121.28125, "epoch": 2.906367041198502, "grad_norm": 1.0847288370132446, "kl": 0.2315935492515564, "learning_rate": 1.4823873524295041e-08, "loss": 0.0093, "reward": 1.3366563320159912, "reward_std": 0.6127877831459045, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44603127241134644, "step": 5432 }, { "completion_length": 152.75, "epoch": 2.9069020866773676, "grad_norm": 11.2811861038208, "kl": 0.5360842943191528, "learning_rate": 1.4655107114101008e-08, "loss": 0.0214, "reward": 2.126687526702881, "reward_std": 0.7202819585800171, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.45481249690055847, "step": 5433 }, { "completion_length": 126.1875, "epoch": 2.9074371321562333, "grad_norm": 0.355453222990036, "kl": 0.19550873339176178, "learning_rate": 1.4487304071213182e-08, "loss": 0.0078, "reward": 2.586625099182129, "reward_std": 0.35051894187927246, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.41475000977516174, "step": 5434 }, { "completion_length": 103.4375, "epoch": 2.907972177635099, "grad_norm": 0.9244081377983093, "kl": 0.3306390345096588, "learning_rate": 1.432046446067481e-08, "loss": 0.0132, "reward": 2.7663750648498535, "reward_std": 0.8729056715965271, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46950000524520874, "step": 5435 }, { "completion_length": 124.84375, "epoch": 2.908507223113965, "grad_norm": 0.9286667108535767, "kl": 0.18738164007663727, "learning_rate": 1.415458834715694e-08, "loss": 0.0075, "reward": 1.9390625953674316, "reward_std": 0.8457573652267456, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4703124761581421, "step": 5436 }, { "completion_length": 153.46875, "epoch": 2.9090422685928305, "grad_norm": 5.7290120124816895, "kl": 0.34352701902389526, "learning_rate": 1.3989675794955637e-08, "loss": 0.0137, "reward": 1.2385624647140503, "reward_std": 0.7978450059890747, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.3948124945163727, "step": 5437 }, { "completion_length": 119.25, "epoch": 2.9095773140716963, "grad_norm": 0.8531530499458313, "kl": 0.19045038521289825, "learning_rate": 1.3825726867994493e-08, "loss": 0.0076, "reward": 2.278249979019165, "reward_std": 0.8747469186782837, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.4970000088214874, "step": 5438 }, { "completion_length": 118.8125, "epoch": 2.9101123595505616, "grad_norm": 0.6214468479156494, "kl": 0.19709432125091553, "learning_rate": 1.3662741629823506e-08, "loss": 0.0079, "reward": 2.4375, "reward_std": 0.6062352657318115, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.484375, "step": 5439 }, { "completion_length": 139.3125, "epoch": 2.9106474050294278, "grad_norm": 1.7443467378616333, "kl": 0.21018879115581512, "learning_rate": 1.3500720143618251e-08, "loss": 0.0084, "reward": 1.393843650817871, "reward_std": 0.7980276346206665, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.39384377002716064, "step": 5440 }, { "completion_length": 147.84375, "epoch": 2.911182450508293, "grad_norm": 1.9442546367645264, "kl": 0.18790775537490845, "learning_rate": 1.3339662472181824e-08, "loss": 0.0075, "reward": 1.4555000066757202, "reward_std": 0.770871639251709, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4398750066757202, "step": 5441 }, { "completion_length": 148.0, "epoch": 2.911717495987159, "grad_norm": 0.8967097997665405, "kl": 0.22075694799423218, "learning_rate": 1.3179568677943177e-08, "loss": 0.0088, "reward": 1.712249994277954, "reward_std": 1.0229814052581787, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4466249942779541, "step": 5442 }, { "completion_length": 157.03125, "epoch": 2.9122525414660245, "grad_norm": 1.3232812881469727, "kl": 0.16010555624961853, "learning_rate": 1.3020438822957671e-08, "loss": 0.0064, "reward": 1.6903125047683716, "reward_std": 1.1103445291519165, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.3934375047683716, "step": 5443 }, { "completion_length": 117.1875, "epoch": 2.9127875869448903, "grad_norm": 0.7136083245277405, "kl": 0.1704525202512741, "learning_rate": 1.2862272968907074e-08, "loss": 0.0068, "reward": 2.331906318664551, "reward_std": 0.6744850277900696, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 5444 }, { "completion_length": 118.21875, "epoch": 2.913322632423756, "grad_norm": 1.0058479309082031, "kl": 0.15766434371471405, "learning_rate": 1.270507117709957e-08, "loss": 0.0063, "reward": 2.302093744277954, "reward_std": 0.7692339420318604, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4739687442779541, "step": 5445 }, { "completion_length": 124.75, "epoch": 2.9138576779026217, "grad_norm": 2.039935827255249, "kl": 0.1392093002796173, "learning_rate": 1.2548833508469471e-08, "loss": 0.0056, "reward": 2.225937604904175, "reward_std": 1.161600112915039, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44468748569488525, "step": 5446 }, { "completion_length": 117.3125, "epoch": 2.9143927233814875, "grad_norm": 1.308406114578247, "kl": 0.22973014414310455, "learning_rate": 1.23935600235775e-08, "loss": 0.0092, "reward": 2.2802813053131104, "reward_std": 0.8424721360206604, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46778127551078796, "step": 5447 }, { "completion_length": 125.375, "epoch": 2.914927768860353, "grad_norm": 0.7267407178878784, "kl": 0.15529721975326538, "learning_rate": 1.223925078261079e-08, "loss": 0.0062, "reward": 2.5897812843322754, "reward_std": 0.7453465461730957, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.496031254529953, "step": 5448 }, { "completion_length": 98.53125, "epoch": 2.915462814339219, "grad_norm": 0.7098506093025208, "kl": 0.19907571375370026, "learning_rate": 1.2085905845382605e-08, "loss": 0.008, "reward": 2.9058125019073486, "reward_std": 0.6480016112327576, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49956250190734863, "step": 5449 }, { "completion_length": 119.125, "epoch": 2.9159978598180847, "grad_norm": 0.9884857535362244, "kl": 0.22187268733978271, "learning_rate": 1.193352527133207e-08, "loss": 0.0089, "reward": 2.71875, "reward_std": 0.8640725016593933, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.484375, "step": 5450 }, { "completion_length": 153.34375, "epoch": 2.9165329052969504, "grad_norm": 0.920061469078064, "kl": 0.11577959358692169, "learning_rate": 1.1782109119524987e-08, "loss": 0.0046, "reward": 1.3341875076293945, "reward_std": 0.6624643802642822, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.41231247782707214, "step": 5451 }, { "completion_length": 146.59375, "epoch": 2.9170679507758157, "grad_norm": 1.4059590101242065, "kl": 0.16987772285938263, "learning_rate": 1.1631657448653577e-08, "loss": 0.0068, "reward": 1.546875, "reward_std": 0.6237328052520752, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.453125, "step": 5452 }, { "completion_length": 137.4375, "epoch": 2.917602996254682, "grad_norm": 0.607192873954773, "kl": 0.17756134271621704, "learning_rate": 1.1482170317034802e-08, "loss": 0.0071, "reward": 1.7399375438690186, "reward_std": 0.7991956472396851, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.44306251406669617, "step": 5453 }, { "completion_length": 113.78125, "epoch": 2.918138041733547, "grad_norm": 0.6749945282936096, "kl": 0.21018874645233154, "learning_rate": 1.1333647782613698e-08, "loss": 0.0084, "reward": 1.75390625, "reward_std": 0.6229214072227478, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.48828125, "step": 5454 }, { "completion_length": 142.28125, "epoch": 2.918673087212413, "grad_norm": 1.3167579174041748, "kl": 0.16105470061302185, "learning_rate": 1.1186089902960052e-08, "loss": 0.0064, "reward": 2.0052812099456787, "reward_std": 0.9033689498901367, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4740312397480011, "step": 5455 }, { "completion_length": 109.78125, "epoch": 2.9192081326912787, "grad_norm": 1.5805314779281616, "kl": 0.35748782753944397, "learning_rate": 1.1039496735269772e-08, "loss": 0.0143, "reward": 2.4146876335144043, "reward_std": 0.7726372480392456, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49281251430511475, "step": 5456 }, { "completion_length": 121.5625, "epoch": 2.9197431781701444, "grad_norm": 79.48588562011719, "kl": 1.044204592704773, "learning_rate": 1.089386833636602e-08, "loss": 0.0418, "reward": 1.7341251373291016, "reward_std": 0.5303187370300293, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.499750018119812, "step": 5457 }, { "completion_length": 119.84375, "epoch": 2.92027822364901, "grad_norm": 0.746509313583374, "kl": 0.1798711121082306, "learning_rate": 1.0749204762696419e-08, "loss": 0.0072, "reward": 1.65625, "reward_std": 0.6499056220054626, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 5458 }, { "completion_length": 137.65625, "epoch": 2.920813269127876, "grad_norm": 11.869758605957031, "kl": 0.21277493238449097, "learning_rate": 1.0605506070335558e-08, "loss": 0.0085, "reward": 1.90625, "reward_std": 0.4760287404060364, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.5, "step": 5459 }, { "completion_length": 124.84375, "epoch": 2.9213483146067416, "grad_norm": 1.00895094871521, "kl": 0.22719264030456543, "learning_rate": 1.0462772314983881e-08, "loss": 0.0091, "reward": 2.6454687118530273, "reward_std": 0.6938403844833374, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48921874165534973, "step": 5460 }, { "completion_length": 146.34375, "epoch": 2.9218833600856073, "grad_norm": 1.0667724609375, "kl": 0.17227379977703094, "learning_rate": 1.032100355196769e-08, "loss": 0.0069, "reward": 1.8930937051773071, "reward_std": 0.997506856918335, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4399687647819519, "step": 5461 }, { "completion_length": 147.96875, "epoch": 2.922418405564473, "grad_norm": 1.4458088874816895, "kl": 0.1784130036830902, "learning_rate": 1.0180199836239413e-08, "loss": 0.0071, "reward": 1.5293124914169312, "reward_std": 0.5129857063293457, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48243749141693115, "step": 5462 }, { "completion_length": 122.03125, "epoch": 2.922953451043339, "grad_norm": 0.905931293964386, "kl": 0.2060997486114502, "learning_rate": 1.0040361222377059e-08, "loss": 0.0082, "reward": 1.4379374980926514, "reward_std": 0.550442099571228, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.45356249809265137, "step": 5463 }, { "completion_length": 134.375, "epoch": 2.9234884965222045, "grad_norm": 9.697265625, "kl": 3.2201640605926514, "learning_rate": 9.901487764584772e-09, "loss": 0.1288, "reward": 2.013906240463257, "reward_std": 0.7576062679290771, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45140624046325684, "step": 5464 }, { "completion_length": 130.8125, "epoch": 2.92402354200107, "grad_norm": 0.5345951914787292, "kl": 0.18760105967521667, "learning_rate": 9.763579516692823e-09, "loss": 0.0075, "reward": 2.086124897003174, "reward_std": 0.8242332935333252, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4767500162124634, "step": 5465 }, { "completion_length": 133.78125, "epoch": 2.924558587479936, "grad_norm": 0.8108288645744324, "kl": 0.17671449482440948, "learning_rate": 9.626636532156508e-09, "loss": 0.0071, "reward": 2.402218818664551, "reward_std": 0.644158124923706, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.464718759059906, "step": 5466 }, { "completion_length": 118.5, "epoch": 2.9250936329588013, "grad_norm": 1.6942198276519775, "kl": 0.20835062861442566, "learning_rate": 9.490658864058366e-09, "loss": 0.0083, "reward": 1.8117499351501465, "reward_std": 0.5857141613960266, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46799999475479126, "step": 5467 }, { "completion_length": 126.25, "epoch": 2.925628678437667, "grad_norm": 1.6567704677581787, "kl": 0.24200429022312164, "learning_rate": 9.355646565105403e-09, "loss": 0.0097, "reward": 2.498500108718872, "reward_std": 0.844860851764679, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4828749895095825, "step": 5468 }, { "completion_length": 121.3125, "epoch": 2.926163723916533, "grad_norm": 1.1247081756591797, "kl": 0.38435718417167664, "learning_rate": 9.221599687630756e-09, "loss": 0.0154, "reward": 2.4263436794281006, "reward_std": 0.9103129506111145, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.42634373903274536, "step": 5469 }, { "completion_length": 123.28125, "epoch": 2.9266987693953985, "grad_norm": 2.729572057723999, "kl": 0.26553088426589966, "learning_rate": 9.08851828359425e-09, "loss": 0.0106, "reward": 2.1000938415527344, "reward_std": 0.6435680389404297, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4594687521457672, "step": 5470 }, { "completion_length": 133.1875, "epoch": 2.9272338148742643, "grad_norm": 0.5731984972953796, "kl": 0.21160301566123962, "learning_rate": 8.956402404580179e-09, "loss": 0.0085, "reward": 1.4140625, "reward_std": 0.5459476113319397, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765625, "step": 5471 }, { "completion_length": 140.96875, "epoch": 2.92776886035313, "grad_norm": 97538.6171875, "kl": 509.61895751953125, "learning_rate": 8.825252101799797e-09, "loss": 20.3848, "reward": 1.8094375133514404, "reward_std": 0.9895426034927368, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43443751335144043, "step": 5472 }, { "completion_length": 154.6875, "epoch": 2.9283039058319957, "grad_norm": 1.3377536535263062, "kl": 0.31580018997192383, "learning_rate": 8.695067426088833e-09, "loss": 0.0126, "reward": 1.6169687509536743, "reward_std": 0.8461573719978333, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3982187509536743, "step": 5473 }, { "completion_length": 159.78125, "epoch": 2.9288389513108615, "grad_norm": 0.7497572302818298, "kl": 0.167704775929451, "learning_rate": 8.565848427909696e-09, "loss": 0.0067, "reward": 1.7049062252044678, "reward_std": 1.0600370168685913, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.43928125500679016, "step": 5474 }, { "completion_length": 134.09375, "epoch": 2.929373996789727, "grad_norm": 0.7203963398933411, "kl": 0.2382964789867401, "learning_rate": 8.437595157350099e-09, "loss": 0.0095, "reward": 1.6885312795639038, "reward_std": 0.4734489619731903, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4697812497615814, "step": 5475 }, { "completion_length": 135.5, "epoch": 2.929909042268593, "grad_norm": 0.6366084218025208, "kl": 0.16570672392845154, "learning_rate": 8.310307664123607e-09, "loss": 0.0066, "reward": 1.850656270980835, "reward_std": 0.4809582829475403, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4444062411785126, "step": 5476 }, { "completion_length": 131.9375, "epoch": 2.9304440877474587, "grad_norm": 0.9153140187263489, "kl": 0.15904250741004944, "learning_rate": 8.183985997569088e-09, "loss": 0.0064, "reward": 2.2968125343322754, "reward_std": 0.8572794795036316, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 5477 }, { "completion_length": 111.78125, "epoch": 2.930979133226324, "grad_norm": 1.905992031097412, "kl": 0.3184286952018738, "learning_rate": 8.058630206650986e-09, "loss": 0.0127, "reward": 2.535031318664551, "reward_std": 0.9560350179672241, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.472531259059906, "step": 5478 }, { "completion_length": 130.34375, "epoch": 2.93151417870519, "grad_norm": 1.7315218448638916, "kl": 0.16709867119789124, "learning_rate": 7.934240339960431e-09, "loss": 0.0067, "reward": 2.2421875, "reward_std": 0.7358886003494263, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 5479 }, { "completion_length": 134.375, "epoch": 2.9320492241840554, "grad_norm": 2.2678754329681396, "kl": 0.18764331936836243, "learning_rate": 7.810816445712466e-09, "loss": 0.0075, "reward": 1.8890000581741333, "reward_std": 0.9480136632919312, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4671249985694885, "step": 5480 }, { "completion_length": 123.78125, "epoch": 2.932584269662921, "grad_norm": 0.8403711318969727, "kl": 0.18177542090415955, "learning_rate": 7.688358571748822e-09, "loss": 0.0073, "reward": 1.6541249752044678, "reward_std": 0.543739914894104, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4666249752044678, "step": 5481 }, { "completion_length": 103.96875, "epoch": 2.933119315141787, "grad_norm": 4.436728000640869, "kl": 0.4342973232269287, "learning_rate": 7.566866765536807e-09, "loss": 0.0174, "reward": 1.7121875286102295, "reward_std": 0.8291865587234497, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4934374690055847, "step": 5482 }, { "completion_length": 123.5, "epoch": 2.9336543606206527, "grad_norm": 0.8444511890411377, "kl": 0.19159093499183655, "learning_rate": 7.4463410741687506e-09, "loss": 0.0077, "reward": 2.45703125, "reward_std": 0.8216465711593628, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45703125, "step": 5483 }, { "completion_length": 134.40625, "epoch": 2.9341894060995184, "grad_norm": 1.205899715423584, "kl": 0.2579284906387329, "learning_rate": 7.326781544362838e-09, "loss": 0.0103, "reward": 1.657156229019165, "reward_std": 0.8313079476356506, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4696562588214874, "step": 5484 }, { "completion_length": 138.5, "epoch": 2.934724451578384, "grad_norm": 0.9375576376914978, "kl": 0.1358068436384201, "learning_rate": 7.208188222462276e-09, "loss": 0.0054, "reward": 1.859281301498413, "reward_std": 0.7244734168052673, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4374062716960907, "step": 5485 }, { "completion_length": 150.46875, "epoch": 2.93525949705725, "grad_norm": 1.8915276527404785, "kl": 0.18931245803833008, "learning_rate": 7.090561154436681e-09, "loss": 0.0076, "reward": 1.47265625, "reward_std": 0.7256392240524292, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44140625, "step": 5486 }, { "completion_length": 123.84375, "epoch": 2.9357945425361156, "grad_norm": 0.9748982787132263, "kl": 0.17482778429985046, "learning_rate": 6.973900385880139e-09, "loss": 0.007, "reward": 2.3007187843322754, "reward_std": 0.951909065246582, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.456968754529953, "step": 5487 }, { "completion_length": 143.65625, "epoch": 2.9363295880149813, "grad_norm": 0.439736008644104, "kl": 0.15545867383480072, "learning_rate": 6.858205962012588e-09, "loss": 0.0062, "reward": 1.868687391281128, "reward_std": 0.4820323586463928, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4311875104904175, "step": 5488 }, { "completion_length": 121.125, "epoch": 2.936864633493847, "grad_norm": 0.7698597311973572, "kl": 0.1765059381723404, "learning_rate": 6.7434779276795425e-09, "loss": 0.0071, "reward": 2.71875, "reward_std": 0.7638771533966064, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.5, "step": 5489 }, { "completion_length": 127.71875, "epoch": 2.937399678972713, "grad_norm": 0.7597352862358093, "kl": 0.181086927652359, "learning_rate": 6.629716327351821e-09, "loss": 0.0072, "reward": 2.737968683242798, "reward_std": 0.5684470534324646, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4879687428474426, "step": 5490 }, { "completion_length": 115.6875, "epoch": 2.9379347244515785, "grad_norm": 0.5579742789268494, "kl": 0.17570897936820984, "learning_rate": 6.516921205125537e-09, "loss": 0.007, "reward": 2.328249931335449, "reward_std": 0.30807632207870483, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.468874990940094, "step": 5491 }, { "completion_length": 141.5, "epoch": 2.9384697699304443, "grad_norm": 1.0088474750518799, "kl": 0.15683777630329132, "learning_rate": 6.405092604722108e-09, "loss": 0.0063, "reward": 1.7567812204360962, "reward_std": 0.9016531705856323, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4755312502384186, "step": 5492 }, { "completion_length": 118.40625, "epoch": 2.9390048154093096, "grad_norm": 1.0471482276916504, "kl": 0.17295503616333008, "learning_rate": 6.294230569488802e-09, "loss": 0.0069, "reward": 2.218625068664551, "reward_std": 0.7227970361709595, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499875009059906, "step": 5493 }, { "completion_length": 114.25, "epoch": 2.9395398608881758, "grad_norm": 1.339530110359192, "kl": 0.2828322649002075, "learning_rate": 6.184335142397358e-09, "loss": 0.0113, "reward": 1.7285938262939453, "reward_std": 0.47742629051208496, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47859373688697815, "step": 5494 }, { "completion_length": 135.90625, "epoch": 2.940074906367041, "grad_norm": 0.7731396555900574, "kl": 0.21727964282035828, "learning_rate": 6.075406366045922e-09, "loss": 0.0087, "reward": 1.9954687356948853, "reward_std": 0.36636969447135925, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47984373569488525, "step": 5495 }, { "completion_length": 159.78125, "epoch": 2.940609951845907, "grad_norm": 0.8293255567550659, "kl": 0.14934495091438293, "learning_rate": 5.967444282656832e-09, "loss": 0.006, "reward": 1.78125, "reward_std": 1.2369621992111206, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.40625, "step": 5496 }, { "completion_length": 141.15625, "epoch": 2.9411449973247725, "grad_norm": 0.6472139954566956, "kl": 0.20561768114566803, "learning_rate": 5.860448934078833e-09, "loss": 0.0082, "reward": 1.6599687337875366, "reward_std": 0.8701874017715454, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4568437337875366, "step": 5497 }, { "completion_length": 165.9375, "epoch": 2.9416800428036383, "grad_norm": 0.653958797454834, "kl": 0.13889530301094055, "learning_rate": 5.754420361784863e-09, "loss": 0.0056, "reward": 1.4463436603546143, "reward_std": 0.8649629950523376, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, "rewards/xmlcount_reward_func": 0.3682187497615814, "step": 5498 }, { "completion_length": 130.75, "epoch": 2.942215088282504, "grad_norm": 0.9588497281074524, "kl": 0.21126317977905273, "learning_rate": 5.649358606873989e-09, "loss": 0.0085, "reward": 2.316281318664551, "reward_std": 0.6261669397354126, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.488156259059906, "step": 5499 }, { "completion_length": 135.84375, "epoch": 2.9427501337613697, "grad_norm": 0.6528149843215942, "kl": 0.16572478413581848, "learning_rate": 5.545263710069748e-09, "loss": 0.0066, "reward": 1.5602812767028809, "reward_std": 0.7982991337776184, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48215624690055847, "step": 5500 }, { "completion_length": 119.34375, "epoch": 2.9432851792402355, "grad_norm": 0.9774942398071289, "kl": 0.18458710610866547, "learning_rate": 5.442135711721808e-09, "loss": 0.0074, "reward": 2.4057188034057617, "reward_std": 0.689332127571106, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.48384374380111694, "step": 5501 }, { "completion_length": 125.0625, "epoch": 2.943820224719101, "grad_norm": 1.1238820552825928, "kl": 0.22071923315525055, "learning_rate": 5.339974651804025e-09, "loss": 0.0088, "reward": 2.2078752517700195, "reward_std": 0.9656259417533875, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47350001335144043, "step": 5502 }, { "completion_length": 123.75, "epoch": 2.944355270197967, "grad_norm": 0.846865713596344, "kl": 0.2647157311439514, "learning_rate": 5.238780569916391e-09, "loss": 0.0106, "reward": 2.004500150680542, "reward_std": 0.8797187209129333, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.45762500166893005, "step": 5503 }, { "completion_length": 129.6875, "epoch": 2.9448903156768327, "grad_norm": 1.2285524606704712, "kl": 0.20177757740020752, "learning_rate": 5.138553505283639e-09, "loss": 0.0081, "reward": 2.5220625400543213, "reward_std": 0.7103092670440674, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4751874804496765, "step": 5504 }, { "completion_length": 124.21875, "epoch": 2.9454253611556984, "grad_norm": 1.4640905857086182, "kl": 0.17406442761421204, "learning_rate": 5.039293496755526e-09, "loss": 0.007, "reward": 1.8488750457763672, "reward_std": 0.7943956255912781, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.4113750159740448, "step": 5505 }, { "completion_length": 141.625, "epoch": 2.9459604066345637, "grad_norm": 1.1338348388671875, "kl": 0.14328473806381226, "learning_rate": 4.941000582807387e-09, "loss": 0.0057, "reward": 1.4601562023162842, "reward_std": 0.6957026720046997, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44453126192092896, "step": 5506 }, { "completion_length": 127.0, "epoch": 2.94649545211343, "grad_norm": 0.906087338924408, "kl": 0.19059540331363678, "learning_rate": 4.8436748015390224e-09, "loss": 0.0076, "reward": 1.835531234741211, "reward_std": 0.7593738436698914, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47615623474121094, "step": 5507 }, { "completion_length": 131.375, "epoch": 2.947030497592295, "grad_norm": 1.641752004623413, "kl": 0.17426231503486633, "learning_rate": 4.747316190676365e-09, "loss": 0.007, "reward": 1.683843731880188, "reward_std": 0.724204957485199, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.465093731880188, "step": 5508 }, { "completion_length": 113.5625, "epoch": 2.947565543071161, "grad_norm": 1.4277325868606567, "kl": 0.3207303285598755, "learning_rate": 4.651924787569262e-09, "loss": 0.0128, "reward": 2.1718125343322754, "reward_std": 0.602485179901123, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.468687504529953, "step": 5509 }, { "completion_length": 138.0, "epoch": 2.9481005885500267, "grad_norm": 1.3697432279586792, "kl": 0.1671488881111145, "learning_rate": 4.557500629193412e-09, "loss": 0.0067, "reward": 2.1011250019073486, "reward_std": 0.8574323058128357, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49175000190734863, "step": 5510 }, { "completion_length": 124.0625, "epoch": 2.9486356340288924, "grad_norm": 3.2197303771972656, "kl": 0.35532528162002563, "learning_rate": 4.464043752149816e-09, "loss": 0.0142, "reward": 2.0412187576293945, "reward_std": 0.8702370524406433, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47871875762939453, "step": 5511 }, { "completion_length": 129.59375, "epoch": 2.949170679507758, "grad_norm": 1.30644953250885, "kl": 0.17887064814567566, "learning_rate": 4.371554192663663e-09, "loss": 0.0072, "reward": 1.4652812480926514, "reward_std": 0.6263561844825745, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44965624809265137, "step": 5512 }, { "completion_length": 127.21875, "epoch": 2.949705724986624, "grad_norm": 1.3481279611587524, "kl": 0.22026373445987701, "learning_rate": 4.28003198658572e-09, "loss": 0.0088, "reward": 2.40596866607666, "reward_std": 1.0859798192977905, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4684687554836273, "step": 5513 }, { "completion_length": 130.125, "epoch": 2.9502407704654896, "grad_norm": 275.8110656738281, "kl": 4.737254619598389, "learning_rate": 4.1894771693920536e-09, "loss": 0.1895, "reward": 2.2040624618530273, "reward_std": 1.1796905994415283, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43843749165534973, "step": 5514 }, { "completion_length": 126.3125, "epoch": 2.9507758159443553, "grad_norm": 0.39669182896614075, "kl": 0.13469210267066956, "learning_rate": 4.099889776183474e-09, "loss": 0.0054, "reward": 2.6091251373291016, "reward_std": 0.39199721813201904, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.484125018119812, "step": 5515 }, { "completion_length": 114.875, "epoch": 2.951310861423221, "grad_norm": 0.7815136909484863, "kl": 0.1976003348827362, "learning_rate": 4.011269841685261e-09, "loss": 0.0079, "reward": 3.0070624351501465, "reward_std": 0.8122972249984741, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47581249475479126, "step": 5516 }, { "completion_length": 122.4375, "epoch": 2.951845906902087, "grad_norm": 1.5725114345550537, "kl": 0.22813545167446136, "learning_rate": 3.923617400248825e-09, "loss": 0.0091, "reward": 2.0317811965942383, "reward_std": 0.6622037291526794, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46928125619888306, "step": 5517 }, { "completion_length": 144.0625, "epoch": 2.9523809523809526, "grad_norm": 1.5378906726837158, "kl": 0.17580106854438782, "learning_rate": 3.836932485849487e-09, "loss": 0.007, "reward": 2.208031177520752, "reward_std": 0.718787431716919, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.45803123712539673, "step": 5518 }, { "completion_length": 124.9375, "epoch": 2.952915997859818, "grad_norm": 2.594897985458374, "kl": 0.3650338053703308, "learning_rate": 3.751215132088148e-09, "loss": 0.0146, "reward": 2.2994375228881836, "reward_std": 1.054514765739441, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4869374930858612, "step": 5519 }, { "completion_length": 135.09375, "epoch": 2.953451043338684, "grad_norm": 1.3558987379074097, "kl": 0.14232827723026276, "learning_rate": 3.666465372190453e-09, "loss": 0.0057, "reward": 2.3499374389648438, "reward_std": 0.7884624600410461, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4749374985694885, "step": 5520 }, { "completion_length": 157.0625, "epoch": 2.9539860888175493, "grad_norm": 0.8201988935470581, "kl": 0.1521892249584198, "learning_rate": 3.5826832390070675e-09, "loss": 0.0061, "reward": 2.1527812480926514, "reward_std": 0.864560604095459, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.37153127789497375, "step": 5521 }, { "completion_length": 156.4375, "epoch": 2.954521134296415, "grad_norm": 1.994701623916626, "kl": 0.20958289504051208, "learning_rate": 3.4998687650134032e-09, "loss": 0.0084, "reward": 1.8099687099456787, "reward_std": 0.6473077535629272, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.3880937695503235, "step": 5522 }, { "completion_length": 131.25, "epoch": 2.955056179775281, "grad_norm": 0.504155158996582, "kl": 0.19892950356006622, "learning_rate": 3.4180219823101714e-09, "loss": 0.008, "reward": 2.549968719482422, "reward_std": 0.6354308128356934, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48746874928474426, "step": 5523 }, { "completion_length": 119.46875, "epoch": 2.9555912252541465, "grad_norm": 0.958141028881073, "kl": 0.17637425661087036, "learning_rate": 3.3371429226225493e-09, "loss": 0.0071, "reward": 2.8591251373291016, "reward_std": 0.24307149648666382, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.499750018119812, "step": 5524 }, { "completion_length": 127.0625, "epoch": 2.9561262707330123, "grad_norm": 485332736.0, "kl": 879123.8125, "learning_rate": 3.257231617301293e-09, "loss": 35164.9531, "reward": 2.6531875133514404, "reward_std": 1.0828392505645752, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.46568751335144043, "step": 5525 }, { "completion_length": 140.40625, "epoch": 2.956661316211878, "grad_norm": 0.8068022131919861, "kl": 0.1294172704219818, "learning_rate": 3.1782880973207918e-09, "loss": 0.0052, "reward": 2.3125, "reward_std": 0.6822727918624878, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.453125, "step": 5526 }, { "completion_length": 137.875, "epoch": 2.9571963616907437, "grad_norm": 0.6345424652099609, "kl": 0.16227175295352936, "learning_rate": 3.1003123932815683e-09, "loss": 0.0065, "reward": 1.8939061164855957, "reward_std": 0.9974024295806885, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.44078123569488525, "step": 5527 }, { "completion_length": 99.5, "epoch": 2.9577314071696095, "grad_norm": 0.8871913552284241, "kl": 0.2668052315711975, "learning_rate": 3.0233045354083333e-09, "loss": 0.0107, "reward": 2.7128124237060547, "reward_std": 0.5788823962211609, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.49406248331069946, "step": 5528 }, { "completion_length": 146.3125, "epoch": 2.958266452648475, "grad_norm": 0.9598188400268555, "kl": 0.24972352385520935, "learning_rate": 2.947264553551099e-09, "loss": 0.01, "reward": 1.8984375, "reward_std": 0.9456361532211304, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4765625, "step": 5529 }, { "completion_length": 116.5625, "epoch": 2.958801498127341, "grad_norm": 0.7346943020820618, "kl": 0.17452885210514069, "learning_rate": 2.872192477184066e-09, "loss": 0.007, "reward": 2.4038748741149902, "reward_std": 0.7609562873840332, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4819999933242798, "step": 5530 }, { "completion_length": 137.125, "epoch": 2.9593365436062067, "grad_norm": 14.471835136413574, "kl": 3.602130889892578, "learning_rate": 2.798088335406457e-09, "loss": 0.1441, "reward": 2.011312484741211, "reward_std": 0.9698576331138611, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.43318748474121094, "step": 5531 }, { "completion_length": 135.1875, "epoch": 2.959871589085072, "grad_norm": 1.280145525932312, "kl": 0.3521474003791809, "learning_rate": 2.7249521569430725e-09, "loss": 0.0141, "reward": 2.0464375019073486, "reward_std": 0.7699291110038757, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45268750190734863, "step": 5532 }, { "completion_length": 117.59375, "epoch": 2.960406634563938, "grad_norm": 0.8396698832511902, "kl": 0.2588388919830322, "learning_rate": 2.6527839701423476e-09, "loss": 0.0104, "reward": 2.1085000038146973, "reward_std": 0.8050386905670166, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.48350000381469727, "step": 5533 }, { "completion_length": 115.28125, "epoch": 2.9609416800428034, "grad_norm": 0.8701934218406677, "kl": 0.22554785013198853, "learning_rate": 2.5815838029782936e-09, "loss": 0.009, "reward": 2.7875938415527344, "reward_std": 1.0406010150909424, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4907187521457672, "step": 5534 }, { "completion_length": 148.15625, "epoch": 2.961476725521669, "grad_norm": 0.7150408625602722, "kl": 0.1674654185771942, "learning_rate": 2.5113516830493902e-09, "loss": 0.0067, "reward": 1.5906562805175781, "reward_std": 0.7326673865318298, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.46565625071525574, "step": 5535 }, { "completion_length": 109.6875, "epoch": 2.962011771000535, "grad_norm": 1.2497411966323853, "kl": 0.21372926235198975, "learning_rate": 2.442087637579138e-09, "loss": 0.0085, "reward": 2.7375001907348633, "reward_std": 0.6107788681983948, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.47187501192092896, "step": 5536 }, { "completion_length": 149.4375, "epoch": 2.9625468164794007, "grad_norm": 2.9069902896881104, "kl": 0.23636528849601746, "learning_rate": 2.3737916934152284e-09, "loss": 0.0095, "reward": 2.011218786239624, "reward_std": 0.878642201423645, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.44871872663497925, "step": 5537 }, { "completion_length": 131.0625, "epoch": 2.9630818619582664, "grad_norm": 2.012855052947998, "kl": 0.2100065052509308, "learning_rate": 2.306463877030929e-09, "loss": 0.0084, "reward": 1.9707813262939453, "reward_std": 0.6712232828140259, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.47078126668930054, "step": 5538 }, { "completion_length": 131.875, "epoch": 2.963616907437132, "grad_norm": 0.8661389350891113, "kl": 0.19591450691223145, "learning_rate": 2.24010421452342e-09, "loss": 0.0078, "reward": 1.9252500534057617, "reward_std": 0.44131308794021606, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.47212499380111694, "step": 5539 }, { "completion_length": 122.28125, "epoch": 2.964151952915998, "grad_norm": 1.9167144298553467, "kl": 0.2592325806617737, "learning_rate": 2.1747127316151808e-09, "loss": 0.0104, "reward": 1.9575937986373901, "reward_std": 0.5448204874992371, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.48884376883506775, "step": 5540 }, { "completion_length": 134.4375, "epoch": 2.9646869983948636, "grad_norm": 0.8145410418510437, "kl": 0.19245707988739014, "learning_rate": 2.1102894536531584e-09, "loss": 0.0077, "reward": 1.8927812576293945, "reward_std": 0.7504684925079346, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45528125762939453, "step": 5541 }, { "completion_length": 140.0625, "epoch": 2.9652220438737293, "grad_norm": 0.782307505607605, "kl": 0.22425711154937744, "learning_rate": 2.0468344056090437e-09, "loss": 0.009, "reward": 1.9476875066757202, "reward_std": 0.721000075340271, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4320625066757202, "step": 5542 }, { "completion_length": 134.84375, "epoch": 2.965757089352595, "grad_norm": 1.6895112991333008, "kl": 0.16538506746292114, "learning_rate": 1.9843476120792735e-09, "loss": 0.0066, "reward": 2.3472812175750732, "reward_std": 0.9444855451583862, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.48790624737739563, "step": 5543 }, { "completion_length": 139.34375, "epoch": 2.966292134831461, "grad_norm": 0.9544693827629089, "kl": 0.17834815382957458, "learning_rate": 1.922829097284751e-09, "loss": 0.0071, "reward": 2.140500068664551, "reward_std": 0.6512764692306519, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.484250009059906, "step": 5544 }, { "completion_length": 133.75, "epoch": 2.966827180310326, "grad_norm": 0.9163661599159241, "kl": 0.14861929416656494, "learning_rate": 1.8622788850714002e-09, "loss": 0.0059, "reward": 2.052093744277954, "reward_std": 0.8377828001976013, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4583437442779541, "step": 5545 }, { "completion_length": 113.53125, "epoch": 2.9673622257891923, "grad_norm": 1.3096848726272583, "kl": 0.19150410592556, "learning_rate": 1.8026969989096143e-09, "loss": 0.0077, "reward": 1.8996250629425049, "reward_std": 0.7826639413833618, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4777500033378601, "step": 5546 }, { "completion_length": 138.0625, "epoch": 2.9678972712680576, "grad_norm": 0.9495174288749695, "kl": 0.1432569921016693, "learning_rate": 1.7440834618945302e-09, "loss": 0.0057, "reward": 1.55287504196167, "reward_std": 0.8376568555831909, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.45912501215934753, "step": 5547 }, { "completion_length": 121.4375, "epoch": 2.9684323167469238, "grad_norm": 1.0210199356079102, "kl": 0.1697228103876114, "learning_rate": 1.6864382967457516e-09, "loss": 0.0068, "reward": 1.8996250629425049, "reward_std": 0.760195255279541, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4777500033378601, "step": 5548 }, { "completion_length": 131.84375, "epoch": 2.968967362225789, "grad_norm": 0.6002065539360046, "kl": 0.22929595410823822, "learning_rate": 1.6297615258076271e-09, "loss": 0.0092, "reward": 1.7669687271118164, "reward_std": 0.7401952147483826, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4857187569141388, "step": 5549 }, { "completion_length": 164.09375, "epoch": 2.969502407704655, "grad_norm": 2.0433943271636963, "kl": 0.32245808839797974, "learning_rate": 1.57405317104925e-09, "loss": 0.0129, "reward": 1.173031210899353, "reward_std": 0.8448989391326904, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.3917812407016754, "step": 5550 }, { "completion_length": 144.0, "epoch": 2.9700374531835205, "grad_norm": 0.8841089606285095, "kl": 0.1721423864364624, "learning_rate": 1.519313254064181e-09, "loss": 0.0069, "reward": 1.7890625, "reward_std": 0.3184587359428406, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4765625, "step": 5551 }, { "completion_length": 136.96875, "epoch": 2.9705724986623863, "grad_norm": 0.8928319215774536, "kl": 0.16887298226356506, "learning_rate": 1.4655417960710017e-09, "loss": 0.0068, "reward": 1.6168749332427979, "reward_std": 0.7255005240440369, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4606249928474426, "step": 5552 }, { "completion_length": 128.09375, "epoch": 2.971107544141252, "grad_norm": 2.1871466636657715, "kl": 0.270528107881546, "learning_rate": 1.4127388179119295e-09, "loss": 0.0108, "reward": 2.1354687213897705, "reward_std": 0.7957998514175415, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4792187511920929, "step": 5553 }, { "completion_length": 136.84375, "epoch": 2.9716425896201177, "grad_norm": 1.158140778541565, "kl": 0.1564163863658905, "learning_rate": 1.3609043400550359e-09, "loss": 0.0063, "reward": 1.8859062194824219, "reward_std": 0.8676710724830627, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46403124928474426, "step": 5554 }, { "completion_length": 108.125, "epoch": 2.9721776350989835, "grad_norm": 0.875260055065155, "kl": 0.19396226108074188, "learning_rate": 1.3100383825917496e-09, "loss": 0.0078, "reward": 2.619187355041504, "reward_std": 0.5716443061828613, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.49418750405311584, "step": 5555 }, { "completion_length": 146.96875, "epoch": 2.972712680577849, "grad_norm": 0.8279083967208862, "kl": 0.2021779716014862, "learning_rate": 1.2601409652393536e-09, "loss": 0.0081, "reward": 1.3835625648498535, "reward_std": 0.7187355160713196, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.44606250524520874, "step": 5556 }, { "completion_length": 148.96875, "epoch": 2.973247726056715, "grad_norm": 0.9432148337364197, "kl": 0.15842598676681519, "learning_rate": 1.2112121073384886e-09, "loss": 0.0063, "reward": 1.652500033378601, "reward_std": 0.9482616186141968, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4025000333786011, "step": 5557 }, { "completion_length": 122.78125, "epoch": 2.9737827715355807, "grad_norm": 1.0718556642532349, "kl": 0.24435566365718842, "learning_rate": 1.1632518278553717e-09, "loss": 0.0098, "reward": 2.2692813873291016, "reward_std": 1.1578058004379272, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.472406268119812, "step": 5558 }, { "completion_length": 145.3125, "epoch": 2.9743178170144464, "grad_norm": 1.2153433561325073, "kl": 0.1400119960308075, "learning_rate": 1.116260145379855e-09, "loss": 0.0056, "reward": 1.9344687461853027, "reward_std": 0.9877176284790039, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 0.43446874618530273, "step": 5559 }, { "completion_length": 134.75, "epoch": 2.9748528624933117, "grad_norm": 2.045424699783325, "kl": 0.2056257277727127, "learning_rate": 1.0702370781270898e-09, "loss": 0.0082, "reward": 2.3478751182556152, "reward_std": 1.0532174110412598, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4728749990463257, "step": 5560 }, { "completion_length": 115.71875, "epoch": 2.975387907972178, "grad_norm": 0.4690457880496979, "kl": 0.19523440301418304, "learning_rate": 1.0251826439364176e-09, "loss": 0.0078, "reward": 3.040468692779541, "reward_std": 0.6552493572235107, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.4779687523841858, "step": 5561 }, { "completion_length": 137.34375, "epoch": 2.975922953451043, "grad_norm": 1.5497148036956787, "kl": 0.23229378461837769, "learning_rate": 9.81096860271924e-10, "loss": 0.0093, "reward": 1.8509374856948853, "reward_std": 0.8762270212173462, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.47593748569488525, "step": 5562 }, { "completion_length": 130.65625, "epoch": 2.976457998929909, "grad_norm": 1.2325078248977661, "kl": 0.16341590881347656, "learning_rate": 9.379797442221616e-10, "loss": 0.0065, "reward": 2.145625114440918, "reward_std": 0.6729657649993896, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4737499952316284, "step": 5563 }, { "completion_length": 142.8125, "epoch": 2.9769930444087747, "grad_norm": 2.0965967178344727, "kl": 0.2208702564239502, "learning_rate": 8.95831312499873e-10, "loss": 0.0088, "reward": 1.855968713760376, "reward_std": 0.5283942222595215, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.46534377336502075, "step": 5564 }, { "completion_length": 133.03125, "epoch": 2.9775280898876404, "grad_norm": 0.8116036057472229, "kl": 0.20829391479492188, "learning_rate": 8.546515814425449e-10, "loss": 0.0083, "reward": 2.310781240463257, "reward_std": 0.5928313732147217, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4514062702655792, "step": 5565 }, { "completion_length": 97.0, "epoch": 2.978063135366506, "grad_norm": 2.811049461364746, "kl": 0.41704607009887695, "learning_rate": 8.144405670126865e-10, "loss": 0.0167, "reward": 2.5575623512268066, "reward_std": 0.8898782730102539, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4794374704360962, "step": 5566 }, { "completion_length": 149.09375, "epoch": 2.978598180845372, "grad_norm": 0.9029207825660706, "kl": 0.16479310393333435, "learning_rate": 7.751982847964412e-10, "loss": 0.0066, "reward": 1.8762500286102295, "reward_std": 1.014924168586731, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4231249988079071, "step": 5567 }, { "completion_length": 127.9375, "epoch": 2.9791332263242376, "grad_norm": 1.0777829885482788, "kl": 0.24988847970962524, "learning_rate": 7.369247500052523e-10, "loss": 0.01, "reward": 1.5409687757492065, "reward_std": 0.9436162710189819, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.43159377574920654, "step": 5568 }, { "completion_length": 137.96875, "epoch": 2.9796682718031033, "grad_norm": 0.7875314950942993, "kl": 0.31443142890930176, "learning_rate": 6.996199774741974e-10, "loss": 0.0126, "reward": 1.8374062776565552, "reward_std": 0.8472235202789307, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4467812478542328, "step": 5569 }, { "completion_length": 141.625, "epoch": 2.980203317281969, "grad_norm": 1.255051612854004, "kl": 0.21235357224941254, "learning_rate": 6.632839816636538e-10, "loss": 0.0085, "reward": 1.241593837738037, "reward_std": 0.4517211318016052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.42909374833106995, "step": 5570 }, { "completion_length": 128.40625, "epoch": 2.980738362760835, "grad_norm": 1.5906633138656616, "kl": 0.2292272001504898, "learning_rate": 6.279167766579108e-10, "loss": 0.0092, "reward": 2.1360626220703125, "reward_std": 0.8311108946800232, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46418750286102295, "step": 5571 }, { "completion_length": 126.875, "epoch": 2.9812734082397006, "grad_norm": 0.6424263715744019, "kl": 0.1749151647090912, "learning_rate": 5.935183761662799e-10, "loss": 0.007, "reward": 2.34375, "reward_std": 0.5234534740447998, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.5, "step": 5572 }, { "completion_length": 136.96875, "epoch": 2.981808453718566, "grad_norm": 2.189896583557129, "kl": 0.21289050579071045, "learning_rate": 5.600887935222621e-10, "loss": 0.0085, "reward": 1.8223437070846558, "reward_std": 0.7171146869659424, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.41609373688697815, "step": 5573 }, { "completion_length": 138.4375, "epoch": 2.982343499197432, "grad_norm": 1.0918749570846558, "kl": 0.14954495429992676, "learning_rate": 5.276280416832702e-10, "loss": 0.006, "reward": 2.3594374656677246, "reward_std": 1.101754903793335, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.421937495470047, "step": 5574 }, { "completion_length": 125.75, "epoch": 2.9828785446762973, "grad_norm": 0.8232557773590088, "kl": 0.1675332486629486, "learning_rate": 4.961361332322945e-10, "loss": 0.0067, "reward": 2.4297187328338623, "reward_std": 1.0465307235717773, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4765937328338623, "step": 5575 }, { "completion_length": 117.75, "epoch": 2.983413590155163, "grad_norm": 1.1642366647720337, "kl": 0.1911366730928421, "learning_rate": 4.656130803759595e-10, "loss": 0.0076, "reward": 2.3208436965942383, "reward_std": 0.8571345806121826, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.44584375619888306, "step": 5576 }, { "completion_length": 114.84375, "epoch": 2.983948635634029, "grad_norm": 3.64142107963562, "kl": 0.4386754333972931, "learning_rate": 4.3605889494563457e-10, "loss": 0.0175, "reward": 2.2543437480926514, "reward_std": 1.125776767730713, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48871874809265137, "step": 5577 }, { "completion_length": 130.71875, "epoch": 2.9844836811128945, "grad_norm": 0.6846196055412292, "kl": 0.1333070695400238, "learning_rate": 4.0747358839687833e-10, "loss": 0.0053, "reward": 2.5635311603546143, "reward_std": 0.5080162286758423, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4697812497615814, "step": 5578 }, { "completion_length": 153.625, "epoch": 2.9850187265917603, "grad_norm": 1.7920210361480713, "kl": 0.20299768447875977, "learning_rate": 3.798571718102717e-10, "loss": 0.0081, "reward": 1.7053749561309814, "reward_std": 0.9205635190010071, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.4241250157356262, "step": 5579 }, { "completion_length": 142.3125, "epoch": 2.985553772070626, "grad_norm": 0.6405460834503174, "kl": 0.1830076277256012, "learning_rate": 3.5320965589030755e-10, "loss": 0.0073, "reward": 2.6575000286102295, "reward_std": 0.7807130217552185, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4699999988079071, "step": 5580 }, { "completion_length": 128.0, "epoch": 2.9860888175494917, "grad_norm": 0.45745962858200073, "kl": 0.17853230237960815, "learning_rate": 3.275310509659457e-10, "loss": 0.0071, "reward": 1.9845937490463257, "reward_std": 0.371957391500473, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4689687490463257, "step": 5581 }, { "completion_length": 123.8125, "epoch": 2.9866238630283575, "grad_norm": 1.062496542930603, "kl": 0.184195414185524, "learning_rate": 3.0282136699061327e-10, "loss": 0.0074, "reward": 1.7359063625335693, "reward_std": 0.5966079831123352, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4859062433242798, "step": 5582 }, { "completion_length": 113.3125, "epoch": 2.987158908507223, "grad_norm": 1.0899980068206787, "kl": 0.18389099836349487, "learning_rate": 2.790806135427593e-10, "loss": 0.0074, "reward": 2.7264063358306885, "reward_std": 0.5148686170578003, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4920312464237213, "step": 5583 }, { "completion_length": 122.9375, "epoch": 2.987693953986089, "grad_norm": 1.7073185443878174, "kl": 0.2556920647621155, "learning_rate": 2.563087998241898e-10, "loss": 0.0102, "reward": 2.321812629699707, "reward_std": 0.7794358730316162, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4936875104904175, "step": 5584 }, { "completion_length": 135.625, "epoch": 2.9882289994649547, "grad_norm": 1.130275845527649, "kl": 0.18002963066101074, "learning_rate": 2.34505934662288e-10, "loss": 0.0072, "reward": 2.47628116607666, "reward_std": 0.6982196569442749, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4606562554836273, "step": 5585 }, { "completion_length": 138.25, "epoch": 2.98876404494382, "grad_norm": 0.9666663408279419, "kl": 0.25306442379951477, "learning_rate": 2.1367202650779407e-10, "loss": 0.0101, "reward": 1.9299688339233398, "reward_std": 0.6574004888534546, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4924687445163727, "step": 5586 }, { "completion_length": 122.28125, "epoch": 2.989299090422686, "grad_norm": 1.1775659322738647, "kl": 0.24241037666797638, "learning_rate": 1.938070834361927e-10, "loss": 0.0097, "reward": 2.546875, "reward_std": 0.5877175331115723, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 5587 }, { "completion_length": 140.90625, "epoch": 2.9898341359015514, "grad_norm": 3.610806941986084, "kl": 0.3350062966346741, "learning_rate": 1.7491111314799082e-10, "loss": 0.0134, "reward": 1.6617813110351562, "reward_std": 0.5586051940917969, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4430312514305115, "step": 5588 }, { "completion_length": 114.75, "epoch": 2.990369181380417, "grad_norm": 0.6184707880020142, "kl": 0.22925442457199097, "learning_rate": 1.5698412296760724e-10, "loss": 0.0092, "reward": 2.381812572479248, "reward_std": 0.8653146028518677, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.47556251287460327, "step": 5589 }, { "completion_length": 139.03125, "epoch": 2.990904226859283, "grad_norm": 0.7820759415626526, "kl": 0.2473164200782776, "learning_rate": 1.4002611984337277e-10, "loss": 0.0099, "reward": 1.7921249866485596, "reward_std": 0.9106019139289856, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 0.44837498664855957, "step": 5590 }, { "completion_length": 128.5, "epoch": 2.9914392723381487, "grad_norm": 0.921036958694458, "kl": 0.19568711519241333, "learning_rate": 1.2403711034891796e-10, "loss": 0.0078, "reward": 1.837937593460083, "reward_std": 0.7306870222091675, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.46293750405311584, "step": 5591 }, { "completion_length": 129.625, "epoch": 2.9919743178170144, "grad_norm": 1.5610949993133545, "kl": 0.2166537046432495, "learning_rate": 1.0901710068206283e-10, "loss": 0.0087, "reward": 1.984375, "reward_std": 0.2414703369140625, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.5, "step": 5592 }, { "completion_length": 137.96875, "epoch": 2.99250936329588, "grad_norm": 0.5215712785720825, "kl": 0.1553751528263092, "learning_rate": 9.496609666453937e-11, "loss": 0.0062, "reward": 2.5360000133514404, "reward_std": 0.40850549936294556, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47350001335144043, "step": 5593 }, { "completion_length": 135.625, "epoch": 2.993044408774746, "grad_norm": 3.3613874912261963, "kl": 0.5111228227615356, "learning_rate": 8.188410374282418e-11, "loss": 0.0204, "reward": 2.1311874389648438, "reward_std": 0.7350854277610779, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 0.4436874985694885, "step": 5594 }, { "completion_length": 119.53125, "epoch": 2.9935794542536116, "grad_norm": 1.1502330303192139, "kl": 0.22341300547122955, "learning_rate": 6.977112698758337e-11, "loss": 0.0089, "reward": 2.047187328338623, "reward_std": 0.8706040978431702, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4690625071525574, "step": 5595 }, { "completion_length": 126.15625, "epoch": 2.9941144997324773, "grad_norm": 2.724332571029663, "kl": 0.4711132049560547, "learning_rate": 5.862717109450521e-11, "loss": 0.0188, "reward": 2.5655624866485596, "reward_std": 0.8371579647064209, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.48743748664855957, "step": 5596 }, { "completion_length": 135.875, "epoch": 2.994649545211343, "grad_norm": 0.6868988275527954, "kl": 0.15030977129936218, "learning_rate": 4.8452240382912406e-11, "loss": 0.006, "reward": 2.0218749046325684, "reward_std": 0.5944159626960754, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.4749999940395355, "step": 5597 }, { "completion_length": 115.6875, "epoch": 2.995184590690209, "grad_norm": 1.253498911857605, "kl": 0.21428252756595612, "learning_rate": 3.924633879687223e-11, "loss": 0.0086, "reward": 1.8125624656677246, "reward_std": 0.5717213749885559, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.484437495470047, "step": 5598 }, { "completion_length": 104.21875, "epoch": 2.995719636169074, "grad_norm": 0.8091989755630493, "kl": 0.27473607659339905, "learning_rate": 3.1009469904641484e-11, "loss": 0.011, "reward": 2.8667500019073486, "reward_std": 1.1173161268234253, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.47612500190734863, "step": 5599 }, { "completion_length": 120.125, "epoch": 2.9962546816479403, "grad_norm": 1.1251611709594727, "kl": 0.21770061552524567, "learning_rate": 2.37416368992216e-11, "loss": 0.0087, "reward": 2.66015625, "reward_std": 0.6329216957092285, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.47265625, "step": 5600 }, { "completion_length": 140.0, "epoch": 2.9967897271268056, "grad_norm": 2.8011364936828613, "kl": 0.22881291806697845, "learning_rate": 1.7442842597525935e-11, "loss": 0.0092, "reward": 2.060093879699707, "reward_std": 0.980509340763092, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4350937604904175, "step": 5601 }, { "completion_length": 100.3125, "epoch": 2.9973247726056713, "grad_norm": 1.1735130548477173, "kl": 0.24567236006259918, "learning_rate": 1.2113089441212478e-11, "loss": 0.0098, "reward": 3.337625026702881, "reward_std": 0.3996531367301941, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.49387499690055847, "step": 5602 }, { "completion_length": 129.75, "epoch": 2.997859818084537, "grad_norm": 1.822002649307251, "kl": 0.20242729783058167, "learning_rate": 7.752379496128726e-12, "loss": 0.0081, "reward": 2.0311875343322754, "reward_std": 0.6584835052490234, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.499937504529953, "step": 5603 }, { "completion_length": 105.875, "epoch": 2.998394863563403, "grad_norm": 1.4363813400268555, "kl": 0.3524048626422882, "learning_rate": 4.360714452589232e-12, "loss": 0.0141, "reward": 3.1039061546325684, "reward_std": 0.7007251977920532, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 0.4945312440395355, "step": 5604 }, { "completion_length": 129.90625, "epoch": 2.9989299090422685, "grad_norm": 1.2552770376205444, "kl": 0.21106834709644318, "learning_rate": 1.9380956253756134e-12, "loss": 0.0084, "reward": 1.9634687900543213, "reward_std": 0.7782601118087769, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4634687304496765, "step": 5605 }, { "completion_length": 158.875, "epoch": 2.9994649545211343, "grad_norm": 5.740185737609863, "kl": 0.18711739778518677, "learning_rate": 4.845239531814372e-13, "loss": 0.0075, "reward": 1.6056874990463257, "reward_std": 0.7449169158935547, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 0.4494374990463257, "step": 5606 }, { "completion_length": 93.0, "epoch": 3.0, "grad_norm": 0.9452321529388428, "kl": 0.12843526899814606, "learning_rate": 0.0, "loss": 0.0051, "reward": 3.5, "reward_std": 0.9977653622627258, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 5607 } ], "logging_steps": 1, "max_steps": 5607, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }