{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.13381506757660913, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 281.1666851043701, "epoch": 0.0005352602703064365, "grad_norm": 0.5917035937309265, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": -0.10487502068281174, "reward_std": 0.644918380305171, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.1041666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.37570834904909134, "step": 1 }, { "completion_length": 590.8750171661377, "epoch": 0.001070520540612873, "grad_norm": 0.7477704286575317, "kl": 0.0, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "reward": 0.22162500163540244, "reward_std": 0.09485530573874712, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09662500163540244, "step": 2 }, { "completion_length": 539.6666870117188, "epoch": 0.0016057808109193096, "grad_norm": 0.4429571032524109, "kl": 0.002077269156870898, "learning_rate": 4.0000000000000003e-07, "loss": 0.0001, "reward": 0.016208335757255554, "reward_std": 0.6246479228138924, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.2083333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1921250014565885, "step": 3 }, { "completion_length": 185.7916717529297, "epoch": 0.002141041081225746, "grad_norm": 0.6855182647705078, "kl": 0.0009879921708488837, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 1.0669583305716515, "reward_std": 0.5203845072537661, "rewards/correctness_reward_func": 0.7500000298023224, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03720833268016577, "step": 4 }, { "completion_length": 587.3750076293945, "epoch": 0.0026763013515321826, "grad_norm": 0.7880218029022217, "kl": 0.0007925744503154419, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": 0.09533333079889417, "reward_std": 0.5927679911255836, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.1041666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09216667525470257, "step": 5 }, { "completion_length": 245.70834159851074, "epoch": 0.003211561621838619, "grad_norm": 0.9220851063728333, "kl": 0.001330614773905836, "learning_rate": 1.0000000000000002e-06, "loss": 0.0001, "reward": 0.08295834437012672, "reward_std": 0.5829638005234301, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.0416666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04204164445400238, "step": 6 }, { "completion_length": 448.7083549499512, "epoch": 0.0037468218921450553, "grad_norm": 0.8303655385971069, "kl": 0.0012542481999844313, "learning_rate": 1.2000000000000002e-06, "loss": 0.0001, "reward": 0.03099999949336052, "reward_std": 0.7598095312714577, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.16666666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13566668145358562, "step": 7 }, { "completion_length": 197.95833587646484, "epoch": 0.004282082162451492, "grad_norm": 1.0582274198532104, "kl": 0.0008733256690902635, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": 0.12341666966676712, "reward_std": 0.09142086654901505, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12341666780412197, "step": 8 }, { "completion_length": 176.41666984558105, "epoch": 0.004817342432757929, "grad_norm": 1.0369133949279785, "kl": 0.005098502180771902, "learning_rate": 1.6000000000000001e-06, "loss": 0.0002, "reward": 0.32220835238695145, "reward_std": 0.3896215371787548, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11387500539422035, "step": 9 }, { "completion_length": 267.66667556762695, "epoch": 0.005352602703064365, "grad_norm": 0.4552709460258484, "kl": 0.001112774269131478, "learning_rate": 1.8000000000000001e-06, "loss": 0.0, "reward": 0.31854166090488434, "reward_std": 0.3717608004808426, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.1250000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02687499998137355, "step": 10 }, { "completion_length": 319.58333587646484, "epoch": 0.005887862973370801, "grad_norm": 0.7992357015609741, "kl": 0.0012863876472692937, "learning_rate": 2.0000000000000003e-06, "loss": 0.0001, "reward": 0.2948333490639925, "reward_std": 0.38963131979107857, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.10416666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10733332857489586, "step": 11 }, { "completion_length": 375.29166984558105, "epoch": 0.006423123243677238, "grad_norm": 0.965032160282135, "kl": 0.0008803782802715432, "learning_rate": 2.2e-06, "loss": 0.0, "reward": 1.012750007212162, "reward_std": 0.5616761147975922, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.22916666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04975000023841858, "step": 12 }, { "completion_length": 482.4583435058594, "epoch": 0.0069583835139836745, "grad_norm": 1.502334713935852, "kl": 0.0028791724907932803, "learning_rate": 2.4000000000000003e-06, "loss": 0.0001, "reward": -0.32758333161473274, "reward_std": 0.7293146029114723, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.02083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34841667115688324, "step": 13 }, { "completion_length": 236.41667366027832, "epoch": 0.007493643784290111, "grad_norm": 0.8761454820632935, "kl": 0.0018893379892688245, "learning_rate": 2.6e-06, "loss": 0.0001, "reward": -0.017458327114582062, "reward_std": 0.31026666425168514, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.06250000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07995832804590464, "step": 14 }, { "completion_length": 306.6250104904175, "epoch": 0.008028904054596548, "grad_norm": 1.070983648300171, "kl": 0.0017425262776669115, "learning_rate": 2.8000000000000003e-06, "loss": 0.0001, "reward": -0.08808333426713943, "reward_std": 0.37953382171690464, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08808333426713943, "step": 15 }, { "completion_length": 366.6666793823242, "epoch": 0.008564164324902984, "grad_norm": 0.6342064738273621, "kl": 0.0009713478648336604, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.03991668112576008, "reward_std": 0.8613052181899548, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.0416666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.16841666959226131, "step": 16 }, { "completion_length": 297.75000381469727, "epoch": 0.00909942459520942, "grad_norm": 0.629483163356781, "kl": 0.0008868449804140255, "learning_rate": 3.2000000000000003e-06, "loss": 0.0, "reward": 1.0445833504199982, "reward_std": 0.3152644243091345, "rewards/correctness_reward_func": 0.6666666716337204, "rewards/int_reward_func": 0.3333333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.044583337381482124, "step": 17 }, { "completion_length": 185.79166793823242, "epoch": 0.009634684865515858, "grad_norm": 1.0232924222946167, "kl": 0.0012610588310053572, "learning_rate": 3.4000000000000005e-06, "loss": 0.0001, "reward": 0.21658334136009216, "reward_std": 0.18211832642555237, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15408333763480186, "step": 18 }, { "completion_length": 361.5416679382324, "epoch": 0.010169945135822294, "grad_norm": 0.8113482594490051, "kl": 0.0015902465383987874, "learning_rate": 3.6000000000000003e-06, "loss": 0.0001, "reward": 0.44666668586432934, "reward_std": 0.5656739473342896, "rewards/correctness_reward_func": 0.2500000074505806, "rewards/int_reward_func": 0.18750000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.009166665724478662, "step": 19 }, { "completion_length": 254.66666984558105, "epoch": 0.01070520540612873, "grad_norm": 1.1768397092819214, "kl": 0.004655150449252687, "learning_rate": 3.8000000000000005e-06, "loss": 0.0002, "reward": 0.48504166305065155, "reward_std": 0.4368314128369093, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.0833333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06837500259280205, "step": 20 }, { "completion_length": 137.91667366027832, "epoch": 0.011240465676435166, "grad_norm": 1.0621542930603027, "kl": 0.0022582018573302776, "learning_rate": 4.000000000000001e-06, "loss": 0.0001, "reward": 0.3660416714847088, "reward_std": 0.5700555201619864, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.011874992400407791, "step": 21 }, { "completion_length": 88.91667175292969, "epoch": 0.011775725946741603, "grad_norm": 1.2175222635269165, "kl": 0.0020840048528043553, "learning_rate": 4.2000000000000004e-06, "loss": 0.0001, "reward": 0.8160417033359408, "reward_std": 0.26041645370423794, "rewards/correctness_reward_func": 0.4166666865348816, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14937500189989805, "step": 22 }, { "completion_length": 150.2083339691162, "epoch": 0.01231098621704804, "grad_norm": 0.8459586501121521, "kl": 0.0017513818893348798, "learning_rate": 4.4e-06, "loss": 0.0001, "reward": 0.5479583460837603, "reward_std": 0.6487118303775787, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.2916666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0062916697934269905, "step": 23 }, { "completion_length": 154.87500381469727, "epoch": 0.012846246487354477, "grad_norm": 0.8675287961959839, "kl": 0.001371489226585254, "learning_rate": 4.600000000000001e-06, "loss": 0.0001, "reward": 0.17075001262128353, "reward_std": 0.19458706118166447, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.14583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.024916673079133034, "step": 24 }, { "completion_length": 150.7500057220459, "epoch": 0.013381506757660913, "grad_norm": 0.8717443943023682, "kl": 0.0020001856610178947, "learning_rate": 4.800000000000001e-06, "loss": 0.0001, "reward": 0.20204169023782015, "reward_std": 0.4490640014410019, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.06250000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.027124996297061443, "step": 25 }, { "completion_length": 467.9583568572998, "epoch": 0.013916767027967349, "grad_norm": 1.1116266250610352, "kl": 0.0012415697274263948, "learning_rate": 5e-06, "loss": 0.0, "reward": 0.304541677236557, "reward_std": 0.28451096825301647, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.14583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07537499908357859, "step": 26 }, { "completion_length": 433.5833339691162, "epoch": 0.014452027298273785, "grad_norm": 0.8203855156898499, "kl": 0.001017560571199283, "learning_rate": 4.999756310023261e-06, "loss": 0.0, "reward": 0.2795000094920397, "reward_std": 0.11828233953565359, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.14583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13366666994988918, "step": 27 }, { "completion_length": 126.00000381469727, "epoch": 0.014987287568580221, "grad_norm": 1.2663164138793945, "kl": 0.0014343319344334304, "learning_rate": 4.999025287600886e-06, "loss": 0.0001, "reward": 0.6117500364780426, "reward_std": 0.4831337593495846, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.2083333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2367500104010105, "step": 28 }, { "completion_length": 229.66667556762695, "epoch": 0.01552254783888666, "grad_norm": 0.8950901627540588, "kl": 0.0015667550032958388, "learning_rate": 4.997807075247147e-06, "loss": 0.0001, "reward": 0.1262916720006615, "reward_std": 0.148749228566885, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.02083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1054583361838013, "step": 29 }, { "completion_length": 210.8750114440918, "epoch": 0.016057808109193095, "grad_norm": 1.1234443187713623, "kl": 0.0019746975012822077, "learning_rate": 4.996101910454953e-06, "loss": 0.0001, "reward": 0.3709999993443489, "reward_std": 0.3687104620039463, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.20833333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0793333351612091, "step": 30 }, { "completion_length": 177.62500190734863, "epoch": 0.01659306837949953, "grad_norm": 0.9238327145576477, "kl": 0.0028813415410695598, "learning_rate": 4.993910125649561e-06, "loss": 0.0001, "reward": 0.39008335024118423, "reward_std": 0.3615882135927677, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.27083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11925000231713057, "step": 31 }, { "completion_length": 173.79167366027832, "epoch": 0.017128328649805968, "grad_norm": 1.0448777675628662, "kl": 0.006160023040138185, "learning_rate": 4.9912321481237616e-06, "loss": 0.0002, "reward": 0.14554167166352272, "reward_std": 0.16822291910648346, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0416666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10387500375509262, "step": 32 }, { "completion_length": 486.75000762939453, "epoch": 0.017663588920112404, "grad_norm": 0.6601158380508423, "kl": 0.001103398812119849, "learning_rate": 4.988068499954578e-06, "loss": 0.0, "reward": -0.5304166711866856, "reward_std": 1.0785409808158875, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.06250000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6762500237673521, "step": 33 }, { "completion_length": 268.12501525878906, "epoch": 0.01819884919041884, "grad_norm": 0.910798966884613, "kl": 0.0013230827316874638, "learning_rate": 4.984419797901491e-06, "loss": 0.0001, "reward": 0.6703750789165497, "reward_std": 0.36514274775981903, "rewards/correctness_reward_func": 0.4166666865348816, "rewards/int_reward_func": 0.12500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1287083402276039, "step": 34 }, { "completion_length": 267.708345413208, "epoch": 0.018734109460725276, "grad_norm": 1.027286171913147, "kl": 0.004269710392691195, "learning_rate": 4.980286753286196e-06, "loss": 0.0002, "reward": 0.4232500046491623, "reward_std": 0.2645573355257511, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.27083334140479565, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1524166688323021, "step": 35 }, { "completion_length": 245.8750114440918, "epoch": 0.019269369731031716, "grad_norm": 1.0046695470809937, "kl": 0.005482323234900832, "learning_rate": 4.975670171853926e-06, "loss": 0.0002, "reward": 0.2149583324790001, "reward_std": 0.7910580635070801, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.2083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07670832984149456, "step": 36 }, { "completion_length": 560.7916870117188, "epoch": 0.019804630001338152, "grad_norm": 0.8885159492492676, "kl": 0.002885772308218293, "learning_rate": 4.970570953616383e-06, "loss": 0.0001, "reward": 0.1944583347067237, "reward_std": 0.05626108031719923, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1944583347067237, "step": 37 }, { "completion_length": 522.8750133514404, "epoch": 0.020339890271644588, "grad_norm": 0.5502146482467651, "kl": 0.009517412836430594, "learning_rate": 4.964990092676263e-06, "loss": 0.0004, "reward": 0.2919999957084656, "reward_std": 0.674026682972908, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08299997518770397, "step": 38 }, { "completion_length": 640.2083511352539, "epoch": 0.020875150541951024, "grad_norm": 0.7062350511550903, "kl": 0.0034277847153134644, "learning_rate": 4.958928677033465e-06, "loss": 0.0001, "reward": -0.004291646182537079, "reward_std": 0.7502853199839592, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.1250000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21262501180171967, "step": 39 }, { "completion_length": 491.08335876464844, "epoch": 0.02141041081225746, "grad_norm": 0.7743443250656128, "kl": 0.004902548622339964, "learning_rate": 4.9523878883729794e-06, "loss": 0.0002, "reward": 0.06212499737739563, "reward_std": 0.8643132671713829, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.1458333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1670416765846312, "step": 40 }, { "completion_length": 301.45834159851074, "epoch": 0.021945671082563897, "grad_norm": 0.7774906754493713, "kl": 0.007360402669291943, "learning_rate": 4.9453690018345144e-06, "loss": 0.0003, "reward": -0.09349998086690903, "reward_std": 0.7777874618768692, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.1041666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2810000069439411, "step": 41 }, { "completion_length": 455.5833435058594, "epoch": 0.022480931352870333, "grad_norm": 0.6825084090232849, "kl": 0.0033266296959482133, "learning_rate": 4.937873385763909e-06, "loss": 0.0001, "reward": 0.2603750079870224, "reward_std": 1.1693618446588516, "rewards/correctness_reward_func": 0.2500000074505806, "rewards/int_reward_func": 0.1458333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": -0.15629167575389147, "step": 42 }, { "completion_length": 254.7500057220459, "epoch": 0.02301619162317677, "grad_norm": 1.0424985885620117, "kl": 0.013458715460728854, "learning_rate": 4.9299025014463665e-06, "loss": 0.0005, "reward": 0.6490416824817657, "reward_std": 0.6297374591231346, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.3125000111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16987500933464617, "step": 43 }, { "completion_length": 243.8333339691162, "epoch": 0.023551451893483205, "grad_norm": 0.6618691682815552, "kl": 0.013514326536096632, "learning_rate": 4.921457902821578e-06, "loss": 0.0005, "reward": 0.235000004991889, "reward_std": 0.4793561212718487, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.12500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02666667103767395, "step": 44 }, { "completion_length": 247.1250057220459, "epoch": 0.02408671216378964, "grad_norm": 0.8150098919868469, "kl": 0.006718623684719205, "learning_rate": 4.912541236180779e-06, "loss": 0.0003, "reward": 0.3942916840314865, "reward_std": 0.4090446010231972, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.1458333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08179166866466403, "step": 45 }, { "completion_length": 311.95833587646484, "epoch": 0.02462197243409608, "grad_norm": 0.9185469746589661, "kl": 0.011615818890277296, "learning_rate": 4.903154239845798e-06, "loss": 0.0005, "reward": 0.6513333357870579, "reward_std": 0.26453326642513275, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.14583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.005500006955116987, "step": 46 }, { "completion_length": 347.25000762939453, "epoch": 0.025157232704402517, "grad_norm": 0.9208869338035583, "kl": 0.019979659002274275, "learning_rate": 4.893298743830168e-06, "loss": 0.0008, "reward": 0.4205416589975357, "reward_std": 0.4555768258869648, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.108041662722826, "step": 47 }, { "completion_length": 245.6666717529297, "epoch": 0.025692492974708953, "grad_norm": 0.8743010759353638, "kl": 0.01431413902901113, "learning_rate": 4.882976669482368e-06, "loss": 0.0006, "reward": 0.269333329051733, "reward_std": 0.6685181586071849, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06099999323487282, "step": 48 }, { "completion_length": 431.12501525878906, "epoch": 0.02622775324501539, "grad_norm": 0.6829207539558411, "kl": 0.010582708870060742, "learning_rate": 4.8721900291112415e-06, "loss": 0.0004, "reward": 1.1573750227689743, "reward_std": 0.5191336497664452, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.27083333767950535, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1365416720509529, "step": 49 }, { "completion_length": 362.2916793823242, "epoch": 0.026763013515321826, "grad_norm": 0.9591223001480103, "kl": 0.021306635811924934, "learning_rate": 4.860940925593703e-06, "loss": 0.0009, "reward": 0.5036666616797447, "reward_std": 0.6060219556093216, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.16666666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.06250000186264515, "rewards/xmlcount_reward_func": 0.19116667530033737, "step": 50 }, { "completion_length": 105.25, "epoch": 0.02729827378562826, "grad_norm": 1.2040213346481323, "kl": 0.03592631733044982, "learning_rate": 4.849231551964771e-06, "loss": 0.0014, "reward": 0.9875416681170464, "reward_std": 0.5127010717988014, "rewards/correctness_reward_func": 0.4166666865348816, "rewards/int_reward_func": 0.20833334140479565, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.32087502628564835, "step": 51 }, { "completion_length": 105.50000190734863, "epoch": 0.027833534055934698, "grad_norm": 0.9755409359931946, "kl": 0.03177254740148783, "learning_rate": 4.837064190990036e-06, "loss": 0.0013, "reward": 0.36112499982118607, "reward_std": 0.23680441081523895, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.14583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.02083333395421505, "rewards/xmlcount_reward_func": 0.19445833936333656, "step": 52 }, { "completion_length": 115.08333778381348, "epoch": 0.028368794326241134, "grad_norm": 1.0774939060211182, "kl": 0.04096163995563984, "learning_rate": 4.824441214720629e-06, "loss": 0.0016, "reward": 0.39137500151991844, "reward_std": 0.21243570372462273, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.22470833733677864, "step": 53 }, { "completion_length": 71.87500095367432, "epoch": 0.02890405459654757, "grad_norm": 1.5860177278518677, "kl": 0.07264666631817818, "learning_rate": 4.811365084030784e-06, "loss": 0.0029, "reward": 0.7297500222921371, "reward_std": 0.40594012290239334, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.25000000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.35475001111626625, "step": 54 }, { "completion_length": 335.0416831970215, "epoch": 0.029439314866854006, "grad_norm": 0.8996623158454895, "kl": 0.03017168352380395, "learning_rate": 4.7978383481380865e-06, "loss": 0.0012, "reward": 0.21566667163278908, "reward_std": 0.38637750223279, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.11149999999906868, "step": 55 }, { "completion_length": 116.00000476837158, "epoch": 0.029974575137160443, "grad_norm": 1.1853581666946411, "kl": 0.03844497771933675, "learning_rate": 4.783863644106502e-06, "loss": 0.0015, "reward": 0.35095833986997604, "reward_std": 0.21118691470474005, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.18429166823625565, "step": 56 }, { "completion_length": 203.58334350585938, "epoch": 0.030509835407466882, "grad_norm": 0.9677571654319763, "kl": 0.03654019068926573, "learning_rate": 4.769443696332272e-06, "loss": 0.0015, "reward": 0.867875000461936, "reward_std": 0.6674522012472153, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.2708333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.08333333395421505, "rewards/xmlcount_reward_func": 0.1803750041872263, "step": 57 }, { "completion_length": 226.58334159851074, "epoch": 0.03104509567777332, "grad_norm": 0.7546242475509644, "kl": 0.10618894919753075, "learning_rate": 4.754581316012785e-06, "loss": 0.0042, "reward": 0.7405833136290312, "reward_std": 1.0614993423223495, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.2708333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.09474998340010643, "step": 58 }, { "completion_length": 96.75000190734863, "epoch": 0.03158035594807975, "grad_norm": 0.9959896206855774, "kl": 0.040049958042800426, "learning_rate": 4.7392794005985324e-06, "loss": 0.0016, "reward": 1.0202916860580444, "reward_std": 0.5125212594866753, "rewards/correctness_reward_func": 0.2500000074505806, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35362500697374344, "step": 59 }, { "completion_length": 77.83333587646484, "epoch": 0.03211561621838619, "grad_norm": 1.0476152896881104, "kl": 0.08419935218989849, "learning_rate": 4.723540933228245e-06, "loss": 0.0034, "reward": 0.6385000422596931, "reward_std": 0.23676574788987637, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.27083333767950535, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36766667664051056, "step": 60 }, { "completion_length": 76.0416669845581, "epoch": 0.03265087648869262, "grad_norm": 1.6621463298797607, "kl": 0.06288609141483903, "learning_rate": 4.707368982147318e-06, "loss": 0.0025, "reward": 0.7950416952371597, "reward_std": 0.2815567087382078, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.2916666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1458333395421505, "rewards/xmlcount_reward_func": 0.35754168033599854, "step": 61 }, { "completion_length": 97.87500190734863, "epoch": 0.03318613675899906, "grad_norm": 1.3513109683990479, "kl": 0.07356535829603672, "learning_rate": 4.690766700109659e-06, "loss": 0.0029, "reward": 1.054708331823349, "reward_std": 0.3879717066884041, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.06250000186264515, "rewards/xmlcount_reward_func": 0.3463750060182065, "step": 62 }, { "completion_length": 356.79167556762695, "epoch": 0.0337213970293055, "grad_norm": 0.9732988476753235, "kl": 0.0708354264497757, "learning_rate": 4.673737323763048e-06, "loss": 0.0028, "reward": 0.3889166936278343, "reward_std": 1.0406904257833958, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.0833333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0833333358168602, "rewards/xmlcount_reward_func": 0.05558334290981293, "step": 63 }, { "completion_length": 124.16666984558105, "epoch": 0.034256657299611935, "grad_norm": 1.4622650146484375, "kl": 0.08344197925180197, "learning_rate": 4.656284173018144e-06, "loss": 0.0033, "reward": 1.0468750149011612, "reward_std": 0.7830385342240334, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.2708333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.16666666977107525, "rewards/xmlcount_reward_func": 0.3593750074505806, "step": 64 }, { "completion_length": 233.333345413208, "epoch": 0.034791917569918375, "grad_norm": 2.0504956245422363, "kl": 0.07171727810055017, "learning_rate": 4.638410650401267e-06, "loss": 0.0029, "reward": 1.2617916613817215, "reward_std": 0.8660007119178772, "rewards/correctness_reward_func": 0.5000000149011612, "rewards/int_reward_func": 0.3750000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.3242916837334633, "step": 65 }, { "completion_length": 147.5833339691162, "epoch": 0.03532717784022481, "grad_norm": 0.6744219660758972, "kl": 0.11512961238622665, "learning_rate": 4.620120240391065e-06, "loss": 0.0046, "reward": 0.8016250282526016, "reward_std": 0.29210690781474113, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.2291666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0416666679084301, "rewards/xmlcount_reward_func": 0.44745834171772003, "step": 66 }, { "completion_length": 76.41666793823242, "epoch": 0.03586243811053125, "grad_norm": 0.8137429356575012, "kl": 0.11052755452692509, "learning_rate": 4.601416508739211e-06, "loss": 0.0044, "reward": 1.2232083678245544, "reward_std": 0.6085939556360245, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.18750000558793545, "rewards/xmlcount_reward_func": 0.45237500965595245, "step": 67 }, { "completion_length": 103.91666984558105, "epoch": 0.03639769838083768, "grad_norm": 1.3938682079315186, "kl": 0.10067875497043133, "learning_rate": 4.582303101775249e-06, "loss": 0.004, "reward": 0.7972083538770676, "reward_std": 0.43218278884887695, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.14583333767950535, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.22916667349636555, "rewards/xmlcount_reward_func": 0.42220834642648697, "step": 68 }, { "completion_length": 607.5000114440918, "epoch": 0.03693295865114412, "grad_norm": 0.7114788293838501, "kl": 0.059856235020561144, "learning_rate": 4.562783745695738e-06, "loss": 0.0024, "reward": 0.6020833402872086, "reward_std": 0.26822593063116074, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.1875000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1041666716337204, "rewards/xmlcount_reward_func": 0.31041666865348816, "step": 69 }, { "completion_length": 366.1666736602783, "epoch": 0.03746821892145055, "grad_norm": 1.5701572895050049, "kl": 0.12799374386668205, "learning_rate": 4.542862245837821e-06, "loss": 0.0051, "reward": 0.6387916915118694, "reward_std": 0.3272341303527355, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.16666667349636555, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.14583333395421505, "rewards/xmlcount_reward_func": 0.3262916784733534, "step": 70 }, { "completion_length": 150.9166717529297, "epoch": 0.03800347919175699, "grad_norm": 1.3559887409210205, "kl": 0.17991142719984055, "learning_rate": 4.522542485937369e-06, "loss": 0.0072, "reward": 0.8541667014360428, "reward_std": 0.28774577379226685, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2500000074505806, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 71 }, { "completion_length": 92.08333778381348, "epoch": 0.03853873946206343, "grad_norm": 0.9042914509773254, "kl": 0.21130416169762611, "learning_rate": 4.501828427371834e-06, "loss": 0.0085, "reward": 0.7811249941587448, "reward_std": 0.20265305042266846, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1666666679084301, "rewards/xmlcount_reward_func": 0.4894583374261856, "step": 72 }, { "completion_length": 125.45833778381348, "epoch": 0.039073999732369864, "grad_norm": 0.8635703921318054, "kl": 0.13566209375858307, "learning_rate": 4.4807241083879774e-06, "loss": 0.0054, "reward": 1.1882500350475311, "reward_std": 0.8479792177677155, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.1875000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2708333358168602, "rewards/xmlcount_reward_func": 0.39658334106206894, "step": 73 }, { "completion_length": 68.79166984558105, "epoch": 0.039609260002676304, "grad_norm": 1.192706823348999, "kl": 0.27350207418203354, "learning_rate": 4.4592336433146e-06, "loss": 0.0109, "reward": 1.7708333730697632, "reward_std": 0.11558076366782188, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 74 }, { "completion_length": 90.79166984558105, "epoch": 0.04014452027298274, "grad_norm": 1.3415632247924805, "kl": 0.15339597314596176, "learning_rate": 4.437361221760449e-06, "loss": 0.0061, "reward": 1.0463333874940872, "reward_std": 0.5073548853397369, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2083333358168602, "rewards/xmlcount_reward_func": 0.46300000697374344, "step": 75 }, { "completion_length": 163.20834159851074, "epoch": 0.040679780543289176, "grad_norm": 1.5307588577270508, "kl": 0.1565667698159814, "learning_rate": 4.415111107797445e-06, "loss": 0.0063, "reward": 1.239583358168602, "reward_std": 0.48138320073485374, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.2083333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125000074505806, "rewards/xmlcount_reward_func": 0.4687500074505806, "step": 76 }, { "completion_length": 162.20834159851074, "epoch": 0.04121504081359561, "grad_norm": 1.7075368165969849, "kl": 0.2482675537467003, "learning_rate": 4.3924876391293915e-06, "loss": 0.0099, "reward": 1.5780000388622284, "reward_std": 0.7741712592542171, "rewards/correctness_reward_func": 0.416666679084301, "rewards/int_reward_func": 0.3333333469927311, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.37500000558793545, "rewards/xmlcount_reward_func": 0.453000009059906, "step": 77 }, { "completion_length": 90.83333587646484, "epoch": 0.04175030108390205, "grad_norm": 1.605960726737976, "kl": 0.1952410712838173, "learning_rate": 4.36949522624633e-06, "loss": 0.0078, "reward": 1.7031250596046448, "reward_std": 0.5450708866119385, "rewards/correctness_reward_func": 0.6666666865348816, "rewards/int_reward_func": 0.2083333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333395421505, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 78 }, { "completion_length": 136.16666984558105, "epoch": 0.04228556135420848, "grad_norm": 0.673092246055603, "kl": 0.1753321774303913, "learning_rate": 4.346138351564711e-06, "loss": 0.007, "reward": 1.2291666716337204, "reward_std": 0.4924144148826599, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.291666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.29166667349636555, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 79 }, { "completion_length": 71.20833683013916, "epoch": 0.04282082162451492, "grad_norm": 1.5928924083709717, "kl": 0.31612617522478104, "learning_rate": 4.322421568553529e-06, "loss": 0.0126, "reward": 2.0416666865348816, "reward_std": 0.20412416756153107, "rewards/correctness_reward_func": 0.5833333358168602, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 80 }, { "completion_length": 112.33333492279053, "epoch": 0.043356081894821354, "grad_norm": 0.4769633412361145, "kl": 0.21202785894274712, "learning_rate": 4.2983495008466285e-06, "loss": 0.0085, "reward": 1.7968750447034836, "reward_std": 0.15385404229164124, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.484375, "step": 81 }, { "completion_length": 175.41667366027832, "epoch": 0.04389134216512779, "grad_norm": 0.9674685597419739, "kl": 0.13749209698289633, "learning_rate": 4.273926841341303e-06, "loss": 0.0055, "reward": 1.6510417461395264, "reward_std": 0.5754482969641685, "rewards/correctness_reward_func": 0.416666679084301, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.33333334140479565, "rewards/xmlcount_reward_func": 0.484375, "step": 82 }, { "completion_length": 67.87500190734863, "epoch": 0.04442660243543423, "grad_norm": 1.0911532640457153, "kl": 0.28480928763747215, "learning_rate": 4.249158351283414e-06, "loss": 0.0114, "reward": 1.4166666865348816, "reward_std": 0.12909945845603943, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 83 }, { "completion_length": 71.91666889190674, "epoch": 0.044961862705740666, "grad_norm": 1.9787883758544922, "kl": 0.2760180849581957, "learning_rate": 4.224048859339175e-06, "loss": 0.011, "reward": 1.1250000596046448, "reward_std": 0.20412414520978928, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.25000000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.37500000558793545, "rewards/xmlcount_reward_func": 0.5, "step": 84 }, { "completion_length": 124.37500190734863, "epoch": 0.045497122976047105, "grad_norm": 0.5233182907104492, "kl": 0.20504293218255043, "learning_rate": 4.198603260653792e-06, "loss": 0.0082, "reward": 1.2343750298023224, "reward_std": 0.3668263405561447, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.4635416716337204, "step": 85 }, { "completion_length": 101.62500381469727, "epoch": 0.04603238324635354, "grad_norm": 1.2110995054244995, "kl": 0.15592870488762856, "learning_rate": 4.172826515897146e-06, "loss": 0.0062, "reward": 1.7083334028720856, "reward_std": 0.5623037368059158, "rewards/correctness_reward_func": 0.5000000223517418, "rewards/int_reward_func": 0.2291666753590107, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 86 }, { "completion_length": 126.25000762939453, "epoch": 0.04656764351665998, "grad_norm": 0.7313621640205383, "kl": 0.18805699050426483, "learning_rate": 4.146723650296701e-06, "loss": 0.0075, "reward": 1.3646250218153, "reward_std": 0.3509002774953842, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.14583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.4687916710972786, "step": 87 }, { "completion_length": 209.45833683013916, "epoch": 0.04710290378696641, "grad_norm": 0.725437343120575, "kl": 0.16482173651456833, "learning_rate": 4.120299752657828e-06, "loss": 0.0066, "reward": 1.046916663646698, "reward_std": 0.4301242418587208, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.1875000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2291666716337204, "rewards/xmlcount_reward_func": 0.4635833352804184, "step": 88 }, { "completion_length": 165.0833396911621, "epoch": 0.04763816405727285, "grad_norm": 0.7724325656890869, "kl": 0.20376655086874962, "learning_rate": 4.093559974371725e-06, "loss": 0.0082, "reward": 1.0260416865348816, "reward_std": 0.1994822435081005, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.1250000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 89 }, { "completion_length": 98.66666984558105, "epoch": 0.04817342432757928, "grad_norm": 1.395373821258545, "kl": 0.2529403530061245, "learning_rate": 4.066509528411151e-06, "loss": 0.0101, "reward": 1.0780000239610672, "reward_std": 0.2602536380290985, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.1875000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.4946666657924652, "step": 90 }, { "completion_length": 89.25, "epoch": 0.04870868459788572, "grad_norm": 0.8370155692100525, "kl": 0.19209491834044456, "learning_rate": 4.039153688314146e-06, "loss": 0.0077, "reward": 1.2864583879709244, "reward_std": 0.43756843730807304, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.0833333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 91 }, { "completion_length": 63.333335876464844, "epoch": 0.04924394486819216, "grad_norm": 2.1854286193847656, "kl": 0.21549397706985474, "learning_rate": 4.011497787155938e-06, "loss": 0.0086, "reward": 1.2964583337306976, "reward_std": 0.1316254585981369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.48395833373069763, "step": 92 }, { "completion_length": 88.62500190734863, "epoch": 0.049779205138498595, "grad_norm": 1.6499943733215332, "kl": 0.2516642101109028, "learning_rate": 3.983547216509254e-06, "loss": 0.0101, "reward": 1.6613333523273468, "reward_std": 0.6053861007094383, "rewards/correctness_reward_func": 0.4166666716337204, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.4946666657924652, "step": 93 }, { "completion_length": 134.12500381469727, "epoch": 0.050314465408805034, "grad_norm": 0.9040616154670715, "kl": 0.18027858808636665, "learning_rate": 3.955307425393224e-06, "loss": 0.0072, "reward": 1.1510417014360428, "reward_std": 0.4075661599636078, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000037252903, "rewards/xmlcount_reward_func": 0.484375, "step": 94 }, { "completion_length": 143.12500381469727, "epoch": 0.05084972567911147, "grad_norm": 0.8769562244415283, "kl": 0.2623859569430351, "learning_rate": 3.92678391921108e-06, "loss": 0.0105, "reward": 1.3593750298023224, "reward_std": 0.3879491835832596, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.3958333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.484375, "step": 95 }, { "completion_length": 80.58333587646484, "epoch": 0.051384985949417906, "grad_norm": 1.6260957717895508, "kl": 0.2531866990029812, "learning_rate": 3.897982258676867e-06, "loss": 0.0101, "reward": 1.3281250298023224, "reward_std": 0.37103308364748955, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.33333334140479565, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.4739583432674408, "step": 96 }, { "completion_length": 104.91666889190674, "epoch": 0.05192024621972434, "grad_norm": 0.8191968202590942, "kl": 0.27327974885702133, "learning_rate": 3.868908058731376e-06, "loss": 0.0109, "reward": 1.1041666865348816, "reward_std": 0.16337091475725174, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.2500000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 97 }, { "completion_length": 62.00000190734863, "epoch": 0.05245550649003078, "grad_norm": 0.3426864743232727, "kl": 0.2767893858253956, "learning_rate": 3.839566987447492e-06, "loss": 0.0111, "reward": 1.3541666865348816, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 98 }, { "completion_length": 389.2500114440918, "epoch": 0.05299076676033721, "grad_norm": 1.0328341722488403, "kl": 0.1228889636695385, "learning_rate": 3.8099647649251984e-06, "loss": 0.0049, "reward": 1.067708358168602, "reward_std": 0.422527939081192, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.27083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2916666716337204, "rewards/xmlcount_reward_func": 0.421875, "step": 99 }, { "completion_length": 73.62500286102295, "epoch": 0.05352602703064365, "grad_norm": 0.7198861241340637, "kl": 0.2295570969581604, "learning_rate": 3.780107162176429e-06, "loss": 0.0092, "reward": 1.375, "reward_std": 0.2885505259037018, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.2916666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 100 }, { "completion_length": 247.083345413208, "epoch": 0.054061287300950084, "grad_norm": 0.8113954663276672, "kl": 0.2137942397966981, "learning_rate": 3.7500000000000005e-06, "loss": 0.0086, "reward": 1.0833750367164612, "reward_std": 0.4043814614415169, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.29166667349636555, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2916666716337204, "rewards/xmlcount_reward_func": 0.4167083315551281, "step": 101 }, { "completion_length": 161.08334159851074, "epoch": 0.05459654757125652, "grad_norm": 0.9631187915802002, "kl": 0.320892296731472, "learning_rate": 3.7196491478468322e-06, "loss": 0.0128, "reward": 1.7552083432674408, "reward_std": 0.20395417511463165, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 102 }, { "completion_length": 86.87500286102295, "epoch": 0.05513180784156296, "grad_norm": 0.5522407293319702, "kl": 0.19866621680557728, "learning_rate": 3.689060522675689e-06, "loss": 0.0079, "reward": 1.6974583566188812, "reward_std": 0.43488648533821106, "rewards/correctness_reward_func": 0.4166666865348816, "rewards/int_reward_func": 0.3333333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.48912499845027924, "step": 103 }, { "completion_length": 116.37500381469727, "epoch": 0.055667068111869396, "grad_norm": 0.39440569281578064, "kl": 0.2107317578047514, "learning_rate": 3.658240087799655e-06, "loss": 0.0084, "reward": 1.4635416865348816, "reward_std": 0.3243303596973419, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.484375, "step": 104 }, { "completion_length": 115.41666984558105, "epoch": 0.056202328382175835, "grad_norm": 1.1680481433868408, "kl": 0.4403987228870392, "learning_rate": 3.627193851723577e-06, "loss": 0.0176, "reward": 1.2552083730697632, "reward_std": 0.2559161148965359, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3125000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.484375, "step": 105 }, { "completion_length": 69.83333396911621, "epoch": 0.05673758865248227, "grad_norm": 0.4940861463546753, "kl": 0.27071962505578995, "learning_rate": 3.595927866972694e-06, "loss": 0.0108, "reward": 1.1666666865348816, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.1666666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 106 }, { "completion_length": 94.45833587646484, "epoch": 0.05727284892278871, "grad_norm": 1.4608356952667236, "kl": 0.16904586926102638, "learning_rate": 3.564448228912682e-06, "loss": 0.0068, "reward": 1.4895833432674408, "reward_std": 0.4765794351696968, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.4895833432674408, "step": 107 }, { "completion_length": 130.75000190734863, "epoch": 0.05780810919309514, "grad_norm": 1.187828779220581, "kl": 0.2347713652998209, "learning_rate": 3.532761074561355e-06, "loss": 0.0094, "reward": 1.0625000298023224, "reward_std": 0.11558076366782188, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.2500000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.5, "step": 108 }, { "completion_length": 81.08333587646484, "epoch": 0.05834336946340158, "grad_norm": 0.4293256103992462, "kl": 0.3195993173867464, "learning_rate": 3.5008725813922383e-06, "loss": 0.0128, "reward": 1.1666666865348816, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.1666666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 109 }, { "completion_length": 153.29166984558105, "epoch": 0.05887862973370801, "grad_norm": 1.0056191682815552, "kl": 0.20048995688557625, "learning_rate": 3.4687889661302577e-06, "loss": 0.008, "reward": 1.4843750298023224, "reward_std": 0.41315262764692307, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.37500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.37500000558793545, "rewards/xmlcount_reward_func": 0.484375, "step": 110 }, { "completion_length": 88.54167175292969, "epoch": 0.05941389000401445, "grad_norm": 1.9036917686462402, "kl": 0.2402002513408661, "learning_rate": 3.436516483539781e-06, "loss": 0.0096, "reward": 1.833333358168602, "reward_std": 0.5183059275150299, "rewards/correctness_reward_func": 0.5000000223517418, "rewards/int_reward_func": 0.39583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 111 }, { "completion_length": 73.37500190734863, "epoch": 0.059949150274320885, "grad_norm": 1.473577857017517, "kl": 0.31748900189995766, "learning_rate": 3.4040614252052305e-06, "loss": 0.0127, "reward": 1.3645833730697632, "reward_std": 0.3358423411846161, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 112 }, { "completion_length": 75.04166793823242, "epoch": 0.060484410544627325, "grad_norm": 0.9336056709289551, "kl": 0.29590417072176933, "learning_rate": 3.3714301183045382e-06, "loss": 0.0118, "reward": 1.270833358168602, "reward_std": 0.1530931033194065, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 113 }, { "completion_length": 71.25000190734863, "epoch": 0.061019670814933764, "grad_norm": 1.1127337217330933, "kl": 0.2616447024047375, "learning_rate": 3.338628924375638e-06, "loss": 0.0105, "reward": 1.291666716337204, "reward_std": 0.16661179810762405, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3125000149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 114 }, { "completion_length": 75.50000286102295, "epoch": 0.0615549310852402, "grad_norm": 0.7082913517951965, "kl": 0.2341964803636074, "learning_rate": 3.3056642380762783e-06, "loss": 0.0094, "reward": 1.4791666865348816, "reward_std": 0.3881191611289978, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.3333333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 115 }, { "completion_length": 206.91667556762695, "epoch": 0.06209019135554664, "grad_norm": 0.41269803047180176, "kl": 0.18453531339764595, "learning_rate": 3.272542485937369e-06, "loss": 0.0074, "reward": 1.5364583730697632, "reward_std": 0.7113124281167984, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125000037252903, "rewards/xmlcount_reward_func": 0.453125, "step": 116 }, { "completion_length": 88.50000381469727, "epoch": 0.06262545162585308, "grad_norm": 0.8284731507301331, "kl": 0.18175287544727325, "learning_rate": 3.2392701251101172e-06, "loss": 0.0073, "reward": 1.3958333730697632, "reward_std": 0.3092299550771713, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 117 }, { "completion_length": 151.66666984558105, "epoch": 0.0631607118961595, "grad_norm": 1.4958237409591675, "kl": 0.2391066513955593, "learning_rate": 3.205853642107192e-06, "loss": 0.0096, "reward": 1.1510416865348816, "reward_std": 0.5006890743970871, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.22916666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2708333432674408, "rewards/xmlcount_reward_func": 0.484375, "step": 118 }, { "completion_length": 123.37500381469727, "epoch": 0.06369597216646594, "grad_norm": 0.7593151926994324, "kl": 0.23146136105060577, "learning_rate": 3.1722995515381644e-06, "loss": 0.0093, "reward": 1.4270833432674408, "reward_std": 0.48038505017757416, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.3750000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.46875, "step": 119 }, { "completion_length": 123.75000762939453, "epoch": 0.06423123243677238, "grad_norm": 0.3002820909023285, "kl": 0.18683998472988605, "learning_rate": 3.1386143948394764e-06, "loss": 0.0075, "reward": 1.3802083730697632, "reward_std": 0.2934284619987011, "rewards/correctness_reward_func": 0.4166666865348816, "rewards/int_reward_func": 0.1041666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.484375, "step": 120 }, { "completion_length": 71.62500190734863, "epoch": 0.06476649270707882, "grad_norm": 0.894112229347229, "kl": 0.2145114541053772, "learning_rate": 3.1048047389991693e-06, "loss": 0.0086, "reward": 1.2291666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 121 }, { "completion_length": 77.50000095367432, "epoch": 0.06530175297738525, "grad_norm": 0.9729853272438049, "kl": 0.25685518980026245, "learning_rate": 3.0708771752766397e-06, "loss": 0.0103, "reward": 1.645833358168602, "reward_std": 0.25515517219901085, "rewards/correctness_reward_func": 0.4166666865348816, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 122 }, { "completion_length": 115.25000381469727, "epoch": 0.06583701324769169, "grad_norm": 0.60560142993927, "kl": 0.26780444383621216, "learning_rate": 3.0368383179176584e-06, "loss": 0.0107, "reward": 1.3333333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 123 }, { "completion_length": 65.25000190734863, "epoch": 0.06637227351799813, "grad_norm": 0.47688770294189453, "kl": 0.2243332415819168, "learning_rate": 3.002694802864912e-06, "loss": 0.009, "reward": 1.2291666865348816, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 124 }, { "completion_length": 349.833345413208, "epoch": 0.06690753378830457, "grad_norm": 0.6750498414039612, "kl": 0.27263053273782134, "learning_rate": 2.9684532864643123e-06, "loss": 0.0109, "reward": 1.5260833650827408, "reward_std": 0.4318152070045471, "rewards/correctness_reward_func": 0.4166666865348816, "rewards/int_reward_func": 0.3333333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.421916663646698, "step": 125 }, { "completion_length": 186.62500762939453, "epoch": 0.067442794058611, "grad_norm": 0.8187605142593384, "kl": 0.2676307410001755, "learning_rate": 2.9341204441673267e-06, "loss": 0.0107, "reward": 1.55266672372818, "reward_std": 0.4699760675430298, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.44849999994039536, "step": 126 }, { "completion_length": 72.50000190734863, "epoch": 0.06797805432891743, "grad_norm": 0.06586437672376633, "kl": 0.25656602531671524, "learning_rate": 2.8997029692295875e-06, "loss": 0.0103, "reward": 1.375, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 127 }, { "completion_length": 67.66666889190674, "epoch": 0.06851331459922387, "grad_norm": 0.13435645401477814, "kl": 0.23636912554502487, "learning_rate": 2.8652075714060296e-06, "loss": 0.0095, "reward": 1.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 128 }, { "completion_length": 71.12500381469727, "epoch": 0.06904857486953031, "grad_norm": 0.6717380285263062, "kl": 0.3070458807051182, "learning_rate": 2.8306409756428067e-06, "loss": 0.0123, "reward": 1.4375000298023224, "reward_std": 0.11558076366782188, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 129 }, { "completion_length": 73.16666889190674, "epoch": 0.06958383513983675, "grad_norm": 0.9400418996810913, "kl": 0.22653049230575562, "learning_rate": 2.7960099207662535e-06, "loss": 0.0091, "reward": 2.0000000596046448, "reward_std": 0.556085180491209, "rewards/correctness_reward_func": 0.6666666865348816, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 130 }, { "completion_length": 71.66666889190674, "epoch": 0.07011909541014318, "grad_norm": 0.6696528792381287, "kl": 0.26699281856417656, "learning_rate": 2.761321158169134e-06, "loss": 0.0107, "reward": 1.3958333730697632, "reward_std": 0.25515518337488174, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.5, "step": 131 }, { "completion_length": 98.91666984558105, "epoch": 0.07065435568044962, "grad_norm": 0.8539110422134399, "kl": 0.23305394127964973, "learning_rate": 2.726581450494451e-06, "loss": 0.0093, "reward": 1.1562500298023224, "reward_std": 0.22562336921691895, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.2916666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000037252903, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 132 }, { "completion_length": 76.66666984558105, "epoch": 0.07118961595075605, "grad_norm": 0.2167089283466339, "kl": 0.20909808576107025, "learning_rate": 2.6917975703170466e-06, "loss": 0.0084, "reward": 1.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 133 }, { "completion_length": 69.50000190734863, "epoch": 0.0717248762210625, "grad_norm": 1.07498037815094, "kl": 0.2273651361465454, "learning_rate": 2.6569762988232838e-06, "loss": 0.0091, "reward": 1.2291666865348816, "reward_std": 0.11558076366782188, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.22916666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 134 }, { "completion_length": 78.87500190734863, "epoch": 0.07226013649136893, "grad_norm": 1.082901954650879, "kl": 0.21143031865358353, "learning_rate": 2.6221244244890336e-06, "loss": 0.0085, "reward": 1.2691666781902313, "reward_std": 0.16395629942417145, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.4775000065565109, "step": 135 }, { "completion_length": 84.79166793823242, "epoch": 0.07279539676167536, "grad_norm": 0.5148155093193054, "kl": 0.19925828650593758, "learning_rate": 2.587248741756253e-06, "loss": 0.008, "reward": 1.375, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 136 }, { "completion_length": 102.62500190734863, "epoch": 0.0733306570319818, "grad_norm": 0.6643544435501099, "kl": 0.1562279723584652, "learning_rate": 2.5523560497083927e-06, "loss": 0.0062, "reward": 1.248750001192093, "reward_std": 0.2717357352375984, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3333333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000149011612, "rewards/xmlcount_reward_func": 0.4779166728258133, "step": 137 }, { "completion_length": 67.0416669845581, "epoch": 0.07386591730228824, "grad_norm": 0.4721544086933136, "kl": 0.19591450318694115, "learning_rate": 2.517453150744904e-06, "loss": 0.0078, "reward": 1.3541666865348816, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 138 }, { "completion_length": 79.37500190734863, "epoch": 0.07440117757259468, "grad_norm": 0.4948488771915436, "kl": 0.20949136465787888, "learning_rate": 2.482546849255096e-06, "loss": 0.0084, "reward": 1.5416666865348816, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 139 }, { "completion_length": 64.29166889190674, "epoch": 0.0749364378429011, "grad_norm": 0.8793404698371887, "kl": 0.22966529056429863, "learning_rate": 2.447643950291608e-06, "loss": 0.0092, "reward": 1.3541666865348816, "reward_std": 0.25515519082546234, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.27083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 140 }, { "completion_length": 139.62500190734863, "epoch": 0.07547169811320754, "grad_norm": 0.5459631681442261, "kl": 0.19374394416809082, "learning_rate": 2.4127512582437486e-06, "loss": 0.0077, "reward": 1.177083358168602, "reward_std": 0.18890930339694023, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.2916666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.46875, "step": 141 }, { "completion_length": 153.16666984558105, "epoch": 0.07600695838351398, "grad_norm": 0.9452311396598816, "kl": 0.1499454267323017, "learning_rate": 2.377875575510967e-06, "loss": 0.006, "reward": 1.7239583730697632, "reward_std": 0.8880488127470016, "rewards/correctness_reward_func": 0.666666679084301, "rewards/int_reward_func": 0.3958333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.4114583358168602, "step": 142 }, { "completion_length": 61.29166793823242, "epoch": 0.07654221865382042, "grad_norm": 0.09438279271125793, "kl": 0.3462410867214203, "learning_rate": 2.3430237011767166e-06, "loss": 0.0138, "reward": 1.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 143 }, { "completion_length": 88.37500286102295, "epoch": 0.07707747892412686, "grad_norm": 1.7993136644363403, "kl": 0.32219041138887405, "learning_rate": 2.3082024296829538e-06, "loss": 0.0129, "reward": 1.4218750596046448, "reward_std": 0.4565740302205086, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.37500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.484375, "step": 144 }, { "completion_length": 72.62500381469727, "epoch": 0.07761273919443329, "grad_norm": 0.09042877703905106, "kl": 0.20896168053150177, "learning_rate": 2.2734185495055503e-06, "loss": 0.0084, "reward": 1.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 145 }, { "completion_length": 163.75000190734863, "epoch": 0.07814799946473973, "grad_norm": 1.1228388547897339, "kl": 0.21238887682557106, "learning_rate": 2.238678841830867e-06, "loss": 0.0085, "reward": 1.1927083730697632, "reward_std": 0.2637527585029602, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3333333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.484375, "step": 146 }, { "completion_length": 71.95833683013916, "epoch": 0.07868325973504617, "grad_norm": 0.661372721195221, "kl": 0.3126152493059635, "learning_rate": 2.2039900792337477e-06, "loss": 0.0125, "reward": 1.4375, "reward_std": 0.22008520364761353, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.39583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 147 }, { "completion_length": 72.33333492279053, "epoch": 0.07921852000535261, "grad_norm": 0.6126328706741333, "kl": 0.18934645876288414, "learning_rate": 2.1693590243571937e-06, "loss": 0.0076, "reward": 1.8333333432674408, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.5833333358168602, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 148 }, { "completion_length": 103.37500381469727, "epoch": 0.07975378027565903, "grad_norm": 0.8722951412200928, "kl": 0.14480971172451973, "learning_rate": 2.134792428593971e-06, "loss": 0.0058, "reward": 1.895833432674408, "reward_std": 0.5449064522981644, "rewards/correctness_reward_func": 0.5000000149011612, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 149 }, { "completion_length": 69.54166984558105, "epoch": 0.08028904054596547, "grad_norm": 0.7660404443740845, "kl": 0.20250581949949265, "learning_rate": 2.1002970307704134e-06, "loss": 0.0081, "reward": 1.3958333432674408, "reward_std": 0.35721728205680847, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 150 }, { "completion_length": 180.7083396911621, "epoch": 0.08082430081627191, "grad_norm": 0.5615190267562866, "kl": 0.23339027352631092, "learning_rate": 2.0658795558326745e-06, "loss": 0.0093, "reward": 1.5729166865348816, "reward_std": 0.31237732619047165, "rewards/correctness_reward_func": 0.4166666865348816, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46875, "step": 151 }, { "completion_length": 63.83333396911621, "epoch": 0.08135956108657835, "grad_norm": 0.1392926722764969, "kl": 0.26523152738809586, "learning_rate": 2.031546713535688e-06, "loss": 0.0106, "reward": 1.375, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 152 }, { "completion_length": 81.08333778381348, "epoch": 0.08189482135688479, "grad_norm": 1.1122794151306152, "kl": 0.25071796402335167, "learning_rate": 1.997305197135089e-06, "loss": 0.01, "reward": 1.5200416892766953, "reward_std": 0.3365423232316971, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.354166679084301, "rewards/xmlcount_reward_func": 0.47837500274181366, "step": 153 }, { "completion_length": 93.37500381469727, "epoch": 0.08243008162719122, "grad_norm": 1.3372734785079956, "kl": 0.19058941677212715, "learning_rate": 1.963161682082342e-06, "loss": 0.0076, "reward": 1.6250000298023224, "reward_std": 0.47279806435108185, "rewards/correctness_reward_func": 0.5000000223517418, "rewards/int_reward_func": 0.22916666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.39583333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 154 }, { "completion_length": 104.83333396911621, "epoch": 0.08296534189749766, "grad_norm": 2.2292211055755615, "kl": 0.16792716644704342, "learning_rate": 1.9291228247233607e-06, "loss": 0.0067, "reward": 2.0000000596046448, "reward_std": 0.4289814233779907, "rewards/correctness_reward_func": 0.583333358168602, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 155 }, { "completion_length": 90.87500476837158, "epoch": 0.0835006021678041, "grad_norm": 0.4122345745563507, "kl": 0.18274019937962294, "learning_rate": 1.895195261000831e-06, "loss": 0.0073, "reward": 1.2708333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.27083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 156 }, { "completion_length": 172.45833587646484, "epoch": 0.08403586243811054, "grad_norm": 0.8472205400466919, "kl": 0.2094765491783619, "learning_rate": 1.8613856051605242e-06, "loss": 0.0084, "reward": 1.2343750298023224, "reward_std": 0.2048850804567337, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.484375, "step": 157 }, { "completion_length": 127.00000762939453, "epoch": 0.08457112270841696, "grad_norm": 0.6804733276367188, "kl": 0.20304612442851067, "learning_rate": 1.827700448461836e-06, "loss": 0.0081, "reward": 1.192708358168602, "reward_std": 0.17030073329806328, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.25000000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.484375, "step": 158 }, { "completion_length": 85.29166793823242, "epoch": 0.0851063829787234, "grad_norm": 0.7838725447654724, "kl": 0.29594049230217934, "learning_rate": 1.7941463578928088e-06, "loss": 0.0118, "reward": 1.2708333432674408, "reward_std": 0.16913224011659622, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 159 }, { "completion_length": 126.00000381469727, "epoch": 0.08564164324902984, "grad_norm": 1.384969711303711, "kl": 0.16587615385651588, "learning_rate": 1.7607298748898844e-06, "loss": 0.0066, "reward": 1.4843750149011612, "reward_std": 0.5336930006742477, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.2916666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.484375, "step": 160 }, { "completion_length": 72.12500286102295, "epoch": 0.08617690351933628, "grad_norm": 0.0433061420917511, "kl": 0.1850288063287735, "learning_rate": 1.7274575140626318e-06, "loss": 0.0074, "reward": 1.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 161 }, { "completion_length": 174.58334159851074, "epoch": 0.08671216378964271, "grad_norm": 0.8271235823631287, "kl": 0.18862449377775192, "learning_rate": 1.6943357619237227e-06, "loss": 0.0075, "reward": 1.2083333432674408, "reward_std": 0.30994437262415886, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.3333333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333432674408, "rewards/xmlcount_reward_func": 0.4583333358168602, "step": 162 }, { "completion_length": 78.20833587646484, "epoch": 0.08724742405994915, "grad_norm": 0.908470869064331, "kl": 0.1706334725022316, "learning_rate": 1.661371075624363e-06, "loss": 0.0068, "reward": 1.2708333432674408, "reward_std": 0.33958156406879425, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.27083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 163 }, { "completion_length": 137.91666984558105, "epoch": 0.08778268433025559, "grad_norm": 0.7333254814147949, "kl": 0.1247784998267889, "learning_rate": 1.6285698816954626e-06, "loss": 0.005, "reward": 1.5208333432674408, "reward_std": 0.767971470952034, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.29166667349636555, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.39583333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 164 }, { "completion_length": 78.08333683013916, "epoch": 0.08831794460056203, "grad_norm": 0.8487056493759155, "kl": 0.22962494008243084, "learning_rate": 1.5959385747947697e-06, "loss": 0.0092, "reward": 1.2916666865348816, "reward_std": 0.14360667020082474, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.31250000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 165 }, { "completion_length": 200.79166793823242, "epoch": 0.08885320487086847, "grad_norm": 1.4497365951538086, "kl": 0.1650528460741043, "learning_rate": 1.56348351646022e-06, "loss": 0.0066, "reward": 1.4895834028720856, "reward_std": 0.5747665874660015, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.46875, "step": 166 }, { "completion_length": 176.12500762939453, "epoch": 0.08938846514117489, "grad_norm": 0.4245186746120453, "kl": 0.15275901928544044, "learning_rate": 1.5312110338697427e-06, "loss": 0.0061, "reward": 1.1822916865348816, "reward_std": 0.3088150769472122, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3125000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.453125, "step": 167 }, { "completion_length": 312.9583396911621, "epoch": 0.08992372541148133, "grad_norm": 0.7641220092773438, "kl": 0.14276206120848656, "learning_rate": 1.4991274186077632e-06, "loss": 0.0057, "reward": 1.166666716337204, "reward_std": 0.49721667170524597, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.22916666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.4375, "step": 168 }, { "completion_length": 101.45833778381348, "epoch": 0.09045898568178777, "grad_norm": 0.7765936255455017, "kl": 0.1949683390557766, "learning_rate": 1.467238925438646e-06, "loss": 0.0078, "reward": 1.2916666865348816, "reward_std": 0.28610818088054657, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 169 }, { "completion_length": 172.45833778381348, "epoch": 0.09099424595209421, "grad_norm": 0.41969409584999084, "kl": 0.13137296214699745, "learning_rate": 1.4355517710873184e-06, "loss": 0.0053, "reward": 2.114583343267441, "reward_std": 0.5827288627624512, "rewards/correctness_reward_func": 0.8333333358168602, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.46875, "step": 170 }, { "completion_length": 79.95833587646484, "epoch": 0.09152950622240064, "grad_norm": 0.16617827117443085, "kl": 0.16961714625358582, "learning_rate": 1.4040721330273063e-06, "loss": 0.0068, "reward": 1.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 171 }, { "completion_length": 80.87500190734863, "epoch": 0.09206476649270708, "grad_norm": 0.36631259322166443, "kl": 0.1460169106721878, "learning_rate": 1.3728061482764238e-06, "loss": 0.0058, "reward": 1.3333333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3333333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 172 }, { "completion_length": 97.12500190734863, "epoch": 0.09260002676301352, "grad_norm": 1.1213421821594238, "kl": 0.14688214287161827, "learning_rate": 1.3417599122003464e-06, "loss": 0.0059, "reward": 1.2291666865348816, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.2291666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 173 }, { "completion_length": 85.58333587646484, "epoch": 0.09313528703331996, "grad_norm": 0.7063998579978943, "kl": 0.17591003328561783, "learning_rate": 1.3109394773243117e-06, "loss": 0.007, "reward": 1.3958333730697632, "reward_std": 0.11558076739311218, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3958333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 174 }, { "completion_length": 89.04166793823242, "epoch": 0.0936705473036264, "grad_norm": 0.7872856855392456, "kl": 0.17240377515554428, "learning_rate": 1.280350852153168e-06, "loss": 0.0069, "reward": 1.2500000447034836, "reward_std": 0.2728445753455162, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.2083333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 175 }, { "completion_length": 252.00000762939453, "epoch": 0.09420580757393282, "grad_norm": 1.0295542478561401, "kl": 0.20453453436493874, "learning_rate": 1.2500000000000007e-06, "loss": 0.0082, "reward": 1.3854167014360428, "reward_std": 0.5866826139390469, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.3958333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.35416666977107525, "rewards/xmlcount_reward_func": 0.46875, "step": 176 }, { "completion_length": 147.75000381469727, "epoch": 0.09474106784423926, "grad_norm": 0.834882915019989, "kl": 0.134497981518507, "learning_rate": 1.2198928378235717e-06, "loss": 0.0054, "reward": 1.4427083730697632, "reward_std": 0.5198761932551861, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.37500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 177 }, { "completion_length": 100.08333778381348, "epoch": 0.0952763281145457, "grad_norm": 1.221563458442688, "kl": 0.16201673820614815, "learning_rate": 1.1900352350748026e-06, "loss": 0.0065, "reward": 1.338541716337204, "reward_std": 0.2559161148965359, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3958333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.484375, "step": 178 }, { "completion_length": 127.83333969116211, "epoch": 0.09581158838485214, "grad_norm": 0.886989176273346, "kl": 0.15441275481134653, "learning_rate": 1.160433012552508e-06, "loss": 0.0062, "reward": 0.9375000149011612, "reward_std": 0.18744874745607376, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000037252903, "rewards/xmlcount_reward_func": 0.5, "step": 179 }, { "completion_length": 86.87500190734863, "epoch": 0.09634684865515857, "grad_norm": 1.1908172369003296, "kl": 0.15353485196828842, "learning_rate": 1.1310919412686248e-06, "loss": 0.0061, "reward": 1.0833333432674408, "reward_std": 0.10206207260489464, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 180 }, { "completion_length": 89.58333587646484, "epoch": 0.096882108925465, "grad_norm": 1.44786536693573, "kl": 0.1816324070096016, "learning_rate": 1.1020177413231334e-06, "loss": 0.0073, "reward": 1.510416716337204, "reward_std": 0.472514558583498, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.3750000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.4895833358168602, "step": 181 }, { "completion_length": 82.58333778381348, "epoch": 0.09741736919577144, "grad_norm": 0.13121654093265533, "kl": 0.18894518539309502, "learning_rate": 1.073216080788921e-06, "loss": 0.0076, "reward": 1.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 182 }, { "completion_length": 221.0000123977661, "epoch": 0.09795262946607788, "grad_norm": 1.4435704946517944, "kl": 0.18115888815373182, "learning_rate": 1.0446925746067768e-06, "loss": 0.0072, "reward": 1.1875, "reward_std": 0.28912585973739624, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.2916666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4375, "step": 183 }, { "completion_length": 125.9583387374878, "epoch": 0.09848788973638432, "grad_norm": 1.2450430393218994, "kl": 0.14993033185601234, "learning_rate": 1.0164527834907468e-06, "loss": 0.006, "reward": 1.6093750596046448, "reward_std": 0.7003048211336136, "rewards/correctness_reward_func": 0.583333358168602, "rewards/int_reward_func": 0.1875000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666716337204, "rewards/xmlcount_reward_func": 0.484375, "step": 184 }, { "completion_length": 250.50000381469727, "epoch": 0.09902315000669075, "grad_norm": 0.6676502823829651, "kl": 0.14374011009931564, "learning_rate": 9.88502212844063e-07, "loss": 0.0058, "reward": 1.6502083837985992, "reward_std": 0.4829741967841983, "rewards/correctness_reward_func": 0.6666666716337204, "rewards/int_reward_func": 0.2916666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2708333395421505, "rewards/xmlcount_reward_func": 0.42104167491197586, "step": 185 }, { "completion_length": 95.41666984558105, "epoch": 0.09955841027699719, "grad_norm": 0.14050611853599548, "kl": 0.19362322241067886, "learning_rate": 9.608463116858544e-07, "loss": 0.0077, "reward": 1.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.5, "step": 186 }, { "completion_length": 69.25000190734863, "epoch": 0.10009367054730363, "grad_norm": 0.7105045318603516, "kl": 0.22476506605744362, "learning_rate": 9.334904715888496e-07, "loss": 0.009, "reward": 1.2291666865348816, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.2291666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 187 }, { "completion_length": 81.08333778381348, "epoch": 0.10062893081761007, "grad_norm": 1.2536767721176147, "kl": 0.20191873610019684, "learning_rate": 9.064400256282757e-07, "loss": 0.0081, "reward": 1.0000000298023224, "reward_std": 0.20479072630405426, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.10416666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 188 }, { "completion_length": 79.95833587646484, "epoch": 0.1011641910879165, "grad_norm": 0.1358516663312912, "kl": 0.2150093950331211, "learning_rate": 8.797002473421729e-07, "loss": 0.0086, "reward": 1.375, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 189 }, { "completion_length": 255.83334350585938, "epoch": 0.10169945135822293, "grad_norm": 0.8230968713760376, "kl": 0.1948665827512741, "learning_rate": 8.532763497032987e-07, "loss": 0.0078, "reward": 1.1041666865348816, "reward_std": 0.3931647092103958, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.2500000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.4375, "step": 190 }, { "completion_length": 72.45833492279053, "epoch": 0.10223471162852937, "grad_norm": 0.6617278456687927, "kl": 0.17371252551674843, "learning_rate": 8.271734841028553e-07, "loss": 0.0069, "reward": 1.4166666865348816, "reward_std": 0.4518480896949768, "rewards/correctness_reward_func": 0.2500000074505806, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 191 }, { "completion_length": 92.29166889190674, "epoch": 0.10276997189883581, "grad_norm": 0.7337960600852966, "kl": 0.18250929936766624, "learning_rate": 8.013967393462094e-07, "loss": 0.0073, "reward": 1.4375000596046448, "reward_std": 0.4927079305052757, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.2708333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 192 }, { "completion_length": 93.79166984558105, "epoch": 0.10330523216914224, "grad_norm": 1.3416187763214111, "kl": 0.19713782332837582, "learning_rate": 7.759511406608255e-07, "loss": 0.0079, "reward": 1.5156250596046448, "reward_std": 0.5118480771780014, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.37500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333432674408, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 193 }, { "completion_length": 96.08333969116211, "epoch": 0.10384049243944868, "grad_norm": 0.9250803589820862, "kl": 0.21085454896092415, "learning_rate": 7.508416487165862e-07, "loss": 0.0084, "reward": 1.8125000596046448, "reward_std": 0.31970491632819176, "rewards/correctness_reward_func": 0.4166666865348816, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 194 }, { "completion_length": 74.00000190734863, "epoch": 0.10437575270975512, "grad_norm": 1.441701889038086, "kl": 0.18313675373792648, "learning_rate": 7.260731586586983e-07, "loss": 0.0073, "reward": 1.7500000298023224, "reward_std": 0.3624359965324402, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333432674408, "rewards/xmlcount_reward_func": 0.5, "step": 195 }, { "completion_length": 116.54167175292969, "epoch": 0.10491101298006156, "grad_norm": 1.0711389780044556, "kl": 0.13384228572249413, "learning_rate": 7.016504991533727e-07, "loss": 0.0054, "reward": 1.1666667014360428, "reward_std": 0.16661180183291435, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.27083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.5, "step": 196 }, { "completion_length": 78.83333587646484, "epoch": 0.105446273250368, "grad_norm": 1.2510522603988647, "kl": 0.18244327045977116, "learning_rate": 6.775784314464717e-07, "loss": 0.0073, "reward": 1.0781250298023224, "reward_std": 0.17936956882476807, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.10416666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 197 }, { "completion_length": 201.83333492279053, "epoch": 0.10598153352067442, "grad_norm": 1.4353007078170776, "kl": 0.17845631763339043, "learning_rate": 6.538616484352902e-07, "loss": 0.0071, "reward": 1.4947916865348816, "reward_std": 0.45137757435441017, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.33333334140479565, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.453125, "step": 198 }, { "completion_length": 315.6666736602783, "epoch": 0.10651679379098086, "grad_norm": 0.7709922790527344, "kl": 0.15007262770086527, "learning_rate": 6.305047737536707e-07, "loss": 0.006, "reward": 1.3333333730697632, "reward_std": 0.43266693875193596, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.4375, "step": 199 }, { "completion_length": 226.25001335144043, "epoch": 0.1070520540612873, "grad_norm": 0.4058845043182373, "kl": 0.24160834029316902, "learning_rate": 6.075123608706093e-07, "loss": 0.0097, "reward": 1.6354166865348816, "reward_std": 0.6109069883823395, "rewards/correctness_reward_func": 0.3333333358168602, "rewards/int_reward_func": 0.3958333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46875, "step": 200 }, { "completion_length": 78.66666793823242, "epoch": 0.10758731433159374, "grad_norm": 0.5436657071113586, "kl": 0.16075460240244865, "learning_rate": 5.848888922025553e-07, "loss": 0.0064, "reward": 1.4375, "reward_std": 0.22008520364761353, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 201 }, { "completion_length": 82.58333396911621, "epoch": 0.10812257460190017, "grad_norm": 0.7532062530517578, "kl": 0.1826519127935171, "learning_rate": 5.626387782395512e-07, "loss": 0.0073, "reward": 1.7916666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.4166666865348816, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 202 }, { "completion_length": 79.16666984558105, "epoch": 0.10865783487220661, "grad_norm": 1.078783631324768, "kl": 0.1677638739347458, "learning_rate": 5.407663566854008e-07, "loss": 0.0067, "reward": 1.4375000298023224, "reward_std": 0.31970491632819176, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.35416666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 203 }, { "completion_length": 92.33333396911621, "epoch": 0.10919309514251305, "grad_norm": 0.8154336214065552, "kl": 0.18606754019856453, "learning_rate": 5.192758916120236e-07, "loss": 0.0074, "reward": 1.9375000298023224, "reward_std": 0.44672293961048126, "rewards/correctness_reward_func": 0.5000000223517418, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 204 }, { "completion_length": 215.95834159851074, "epoch": 0.10972835541281949, "grad_norm": 0.8592817783355713, "kl": 0.15069226268678904, "learning_rate": 4.981715726281666e-07, "loss": 0.006, "reward": 1.2552083730697632, "reward_std": 0.21166006475687027, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3958333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.39583333395421505, "rewards/xmlcount_reward_func": 0.4635416716337204, "step": 205 }, { "completion_length": 160.62500381469727, "epoch": 0.11026361568312593, "grad_norm": 1.0854992866516113, "kl": 0.1672863345593214, "learning_rate": 4.774575140626317e-07, "loss": 0.0067, "reward": 1.3802083730697632, "reward_std": 0.4717924892902374, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.484375, "step": 206 }, { "completion_length": 85.62500190734863, "epoch": 0.11079887595343235, "grad_norm": 2.937671184539795, "kl": 0.22150231339037418, "learning_rate": 4.5713775416217884e-07, "loss": 0.0089, "reward": 1.1458333730697632, "reward_std": 0.3955717794597149, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.18750000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 207 }, { "completion_length": 86.54166889190674, "epoch": 0.11133413622373879, "grad_norm": 0.791983962059021, "kl": 0.2634577229619026, "learning_rate": 4.372162543042624e-07, "loss": 0.0105, "reward": 1.3541667014360428, "reward_std": 0.31970490142703056, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 208 }, { "completion_length": 103.04166984558105, "epoch": 0.11186939649404523, "grad_norm": 0.7941485643386841, "kl": 0.15498985722661018, "learning_rate": 4.1769689822475147e-07, "loss": 0.0062, "reward": 1.2343750298023224, "reward_std": 0.3260645717382431, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25000000186264515, "rewards/xmlcount_reward_func": 0.4635416716337204, "step": 209 }, { "completion_length": 126.79166984558105, "epoch": 0.11240465676435167, "grad_norm": 1.0149474143981934, "kl": 0.12467027455568314, "learning_rate": 3.9858349126078945e-07, "loss": 0.005, "reward": 1.7271667420864105, "reward_std": 0.5239234380424023, "rewards/correctness_reward_func": 0.4166666865348816, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.47716666758060455, "step": 210 }, { "completion_length": 85.79166984558105, "epoch": 0.1129399170346581, "grad_norm": 0.8396437764167786, "kl": 0.18657264113426208, "learning_rate": 3.798797596089351e-07, "loss": 0.0075, "reward": 1.5000000298023224, "reward_std": 0.32274864614009857, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.3333333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 211 }, { "completion_length": 108.00000381469727, "epoch": 0.11347517730496454, "grad_norm": 0.934079110622406, "kl": 0.1696683205664158, "learning_rate": 3.615893495987335e-07, "loss": 0.0068, "reward": 1.8125000298023224, "reward_std": 0.6319277845323086, "rewards/correctness_reward_func": 0.5000000149011612, "rewards/int_reward_func": 0.3958333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.416666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 212 }, { "completion_length": 193.0416669845581, "epoch": 0.11401043757527098, "grad_norm": 0.6677061319351196, "kl": 0.1520245149731636, "learning_rate": 3.4371582698185636e-07, "loss": 0.0061, "reward": 1.713541716337204, "reward_std": 0.5496542304754257, "rewards/correctness_reward_func": 0.583333358168602, "rewards/int_reward_func": 0.3333333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333395421505, "rewards/xmlcount_reward_func": 0.4635416716337204, "step": 213 }, { "completion_length": 86.66666984558105, "epoch": 0.11454569784557742, "grad_norm": 0.4593207538127899, "kl": 0.1518435962498188, "learning_rate": 3.262626762369525e-07, "loss": 0.0061, "reward": 1.0833333432674408, "reward_std": 0.23273734748363495, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.5, "step": 214 }, { "completion_length": 104.45833778381348, "epoch": 0.11508095811588386, "grad_norm": 0.42647111415863037, "kl": 0.21028569713234901, "learning_rate": 3.092332998903416e-07, "loss": 0.0084, "reward": 1.6458333730697632, "reward_std": 0.2753772810101509, "rewards/correctness_reward_func": 0.4166666865348816, "rewards/int_reward_func": 0.4166666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333358168602, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 215 }, { "completion_length": 325.2500114440918, "epoch": 0.11561621838619028, "grad_norm": 0.5365056991577148, "kl": 0.134639460593462, "learning_rate": 2.9263101785268253e-07, "loss": 0.0054, "reward": 1.604166679084301, "reward_std": 0.14088044688105583, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.2916666679084301, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.4375, "step": 216 }, { "completion_length": 89.54166793823242, "epoch": 0.11615147865649672, "grad_norm": 0.6397629976272583, "kl": 0.21499066427350044, "learning_rate": 2.764590667717562e-07, "loss": 0.0086, "reward": 1.5, "reward_std": 0.19364917278289795, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 217 }, { "completion_length": 85.50000095367432, "epoch": 0.11668673892680316, "grad_norm": 1.3944755792617798, "kl": 0.13307987339794636, "learning_rate": 2.6072059940146775e-07, "loss": 0.0053, "reward": 1.541666716337204, "reward_std": 0.5103103779256344, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.39583333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 218 }, { "completion_length": 80.66666793823242, "epoch": 0.1172219991971096, "grad_norm": 0.08477512001991272, "kl": 0.16065896674990654, "learning_rate": 2.454186839872158e-07, "loss": 0.0064, "reward": 1.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 219 }, { "completion_length": 72.79166793823242, "epoch": 0.11775725946741603, "grad_norm": 0.6473060250282288, "kl": 0.2047443389892578, "learning_rate": 2.3055630366772857e-07, "loss": 0.0082, "reward": 1.6666666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.4166666865348816, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 220 }, { "completion_length": 255.87500381469727, "epoch": 0.11829251973772247, "grad_norm": 1.165757656097412, "kl": 0.34885890036821365, "learning_rate": 2.1613635589349756e-07, "loss": 0.014, "reward": 1.2083333432674408, "reward_std": 0.5354874432086945, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3750000074505806, "rewards/xmlcount_reward_func": 0.4375, "step": 221 }, { "completion_length": 68.04166889190674, "epoch": 0.1188277800080289, "grad_norm": 1.5734628438949585, "kl": 0.35767246037721634, "learning_rate": 2.0216165186191406e-07, "loss": 0.0143, "reward": 1.729166716337204, "reward_std": 0.4259376786649227, "rewards/correctness_reward_func": 0.5000000223517418, "rewards/int_reward_func": 0.25000000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 222 }, { "completion_length": 93.95833587646484, "epoch": 0.11936304027833534, "grad_norm": 0.3049551248550415, "kl": 0.14186285808682442, "learning_rate": 1.8863491596921745e-07, "loss": 0.0057, "reward": 1.375, "reward_std": 0.25, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.5, "step": 223 }, { "completion_length": 170.7500057220459, "epoch": 0.11989830054864177, "grad_norm": 0.8757752180099487, "kl": 0.17503754422068596, "learning_rate": 1.7555878527937164e-07, "loss": 0.007, "reward": 1.1865417063236237, "reward_std": 0.413798563182354, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.2291666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.4782083332538605, "step": 224 }, { "completion_length": 191.8333396911621, "epoch": 0.12043356081894821, "grad_norm": 1.3906254768371582, "kl": 0.11161109246313572, "learning_rate": 1.629358090099639e-07, "loss": 0.0045, "reward": 0.96875, "reward_std": 0.29255440831184387, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.1875000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125000037252903, "rewards/xmlcount_reward_func": 0.46875, "step": 225 }, { "completion_length": 150.75000762939453, "epoch": 0.12096882108925465, "grad_norm": 0.7032321691513062, "kl": 0.23093389347195625, "learning_rate": 1.507684480352292e-07, "loss": 0.0092, "reward": 1.651041716337204, "reward_std": 0.36715345084667206, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.4375000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.39583333395421505, "rewards/xmlcount_reward_func": 0.484375, "step": 226 }, { "completion_length": 173.04166793823242, "epoch": 0.12150408135956109, "grad_norm": 0.777036726474762, "kl": 0.22725828364491463, "learning_rate": 1.3905907440629752e-07, "loss": 0.0091, "reward": 1.333333358168602, "reward_std": 0.3425312591716647, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.3750000037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333358168602, "rewards/xmlcount_reward_func": 0.4791666716337204, "step": 227 }, { "completion_length": 114.66667079925537, "epoch": 0.12203934162986753, "grad_norm": 2.417041778564453, "kl": 0.15211265347898006, "learning_rate": 1.278099708887587e-07, "loss": 0.0061, "reward": 2.067708373069763, "reward_std": 0.8522266149520874, "rewards/correctness_reward_func": 0.6666666716337204, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.484375, "step": 228 }, { "completion_length": 71.45833587646484, "epoch": 0.12257460190017395, "grad_norm": 1.4324641227722168, "kl": 0.2629435919225216, "learning_rate": 1.1702333051763271e-07, "loss": 0.0105, "reward": 1.4375000149011612, "reward_std": 0.4443886801600456, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.20833333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 229 }, { "completion_length": 265.95834159851074, "epoch": 0.1231098621704804, "grad_norm": 0.49865278601646423, "kl": 0.08768011070787907, "learning_rate": 1.067012561698319e-07, "loss": 0.0035, "reward": 0.8750000149011612, "reward_std": 0.42222120985388756, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.02083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333432674408, "rewards/xmlcount_reward_func": 0.4375, "step": 230 }, { "completion_length": 365.291672706604, "epoch": 0.12364512244078683, "grad_norm": 0.5076226592063904, "kl": 0.13591468706727028, "learning_rate": 9.684576015420277e-08, "loss": 0.0054, "reward": 1.083333358168602, "reward_std": 0.49768130481243134, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.27083333395421505, "rewards/xmlcount_reward_func": 0.3958333358168602, "step": 231 }, { "completion_length": 95.41666793823242, "epoch": 0.12418038271109327, "grad_norm": 1.2022953033447266, "kl": 0.14907664991915226, "learning_rate": 8.745876381922147e-08, "loss": 0.006, "reward": 1.3697917461395264, "reward_std": 0.4918174706399441, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.31250000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.4739583358168602, "step": 232 }, { "completion_length": 113.79166793823242, "epoch": 0.1247156429813997, "grad_norm": 0.5673840641975403, "kl": 0.26192033290863037, "learning_rate": 7.854209717842231e-08, "loss": 0.0105, "reward": 1.3125000149011612, "reward_std": 0.15864631533622742, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.37500000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 233 }, { "completion_length": 112.12500190734863, "epoch": 0.12525090325170615, "grad_norm": 1.3854151964187622, "kl": 0.17152241989970207, "learning_rate": 7.009749855363457e-08, "loss": 0.0069, "reward": 1.4375000596046448, "reward_std": 0.4543575756251812, "rewards/correctness_reward_func": 0.1666666716337204, "rewards/int_reward_func": 0.33333333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.5, "step": 234 }, { "completion_length": 98.75000381469727, "epoch": 0.12578616352201258, "grad_norm": 0.790249764919281, "kl": 0.14792344719171524, "learning_rate": 6.212661423609184e-08, "loss": 0.0059, "reward": 1.7291666865348816, "reward_std": 0.5564306005835533, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 235 }, { "completion_length": 144.8333396911621, "epoch": 0.126321423792319, "grad_norm": 0.2076501101255417, "kl": 0.18184524960815907, "learning_rate": 5.463099816548578e-08, "loss": 0.0073, "reward": 1.4479166865348816, "reward_std": 0.2685803771018982, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.4583333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.46875, "step": 236 }, { "completion_length": 142.12500190734863, "epoch": 0.12685668406262546, "grad_norm": 1.9539458751678467, "kl": 0.24447984993457794, "learning_rate": 4.761211162702117e-08, "loss": 0.0098, "reward": 1.2968750596046448, "reward_std": 0.29226116091012955, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3333333469927311, "rewards/xmlcount_reward_func": 0.484375, "step": 237 }, { "completion_length": 72.37500381469727, "epoch": 0.12739194433293188, "grad_norm": 1.3912091255187988, "kl": 0.29350727051496506, "learning_rate": 4.1071322966535487e-08, "loss": 0.0117, "reward": 1.291666716337204, "reward_std": 0.2096773497760296, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.3125000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 238 }, { "completion_length": 82.41667175292969, "epoch": 0.12792720460323834, "grad_norm": 0.43811848759651184, "kl": 0.202481709420681, "learning_rate": 3.5009907323737826e-08, "loss": 0.0081, "reward": 1.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.2291666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 239 }, { "completion_length": 85.45833587646484, "epoch": 0.12846246487354476, "grad_norm": 0.8119534850120544, "kl": 0.23333512246608734, "learning_rate": 2.9429046383618042e-08, "loss": 0.0093, "reward": 1.1666666865348816, "reward_std": 0.11949635669589043, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.1666666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 240 }, { "completion_length": 160.91666984558105, "epoch": 0.1289977251438512, "grad_norm": 1.1854231357574463, "kl": 0.15807193890213966, "learning_rate": 2.4329828146074096e-08, "loss": 0.0063, "reward": 0.8219999894499779, "reward_std": 0.6791375987231731, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375000074505806, "rewards/xmlcount_reward_func": 0.19699998199939728, "step": 241 }, { "completion_length": 72.75000190734863, "epoch": 0.12953298541415764, "grad_norm": 0.45305031538009644, "kl": 0.20153097435832024, "learning_rate": 1.9713246713805588e-08, "loss": 0.0081, "reward": 1.3125, "reward_std": 0.22008520364761353, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.2291666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 242 }, { "completion_length": 152.1666717529297, "epoch": 0.13006824568446407, "grad_norm": 0.4157455861568451, "kl": 0.1709270216524601, "learning_rate": 1.5580202098509078e-08, "loss": 0.0068, "reward": 1.4427083432674408, "reward_std": 0.345734566450119, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666679084301, "rewards/xmlcount_reward_func": 0.484375, "step": 243 }, { "completion_length": 72.79166889190674, "epoch": 0.1306035059547705, "grad_norm": 1.6172090768814087, "kl": 0.1881002075970173, "learning_rate": 1.193150004542204e-08, "loss": 0.0075, "reward": 1.4322916865348816, "reward_std": 0.26791293919086456, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.4947916716337204, "step": 244 }, { "completion_length": 63.541666984558105, "epoch": 0.13113876622507695, "grad_norm": 0.6279511451721191, "kl": 0.26533466950058937, "learning_rate": 8.767851876239075e-09, "loss": 0.0106, "reward": 1.2708333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.27083333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5, "step": 245 }, { "completion_length": 188.20833587646484, "epoch": 0.13167402649538337, "grad_norm": 0.917432963848114, "kl": 0.16238786652684212, "learning_rate": 6.089874350439507e-09, "loss": 0.0065, "reward": 1.6145834028720856, "reward_std": 0.7706367075443268, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.3958333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4166666716337204, "rewards/xmlcount_reward_func": 0.46875, "step": 246 }, { "completion_length": 102.25000381469727, "epoch": 0.13220928676568983, "grad_norm": 0.6094866394996643, "kl": 0.1517469845712185, "learning_rate": 3.8980895450474455e-09, "loss": 0.0061, "reward": 1.2708333432674408, "reward_std": 0.27258947491645813, "rewards/correctness_reward_func": 0.0833333358168602, "rewards/int_reward_func": 0.2291666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4583333358168602, "rewards/xmlcount_reward_func": 0.5, "step": 247 }, { "completion_length": 265.0000114440918, "epoch": 0.13274454703599625, "grad_norm": 1.4663077592849731, "kl": 0.1445157825946808, "learning_rate": 2.192924752854042e-09, "loss": 0.0058, "reward": 0.9218750074505806, "reward_std": 0.21220333129167557, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.18750000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 0.421875, "step": 248 }, { "completion_length": 78.83333587646484, "epoch": 0.13327980730630268, "grad_norm": 1.0312261581420898, "kl": 0.16930826753377914, "learning_rate": 9.747123991141193e-10, "loss": 0.0068, "reward": 1.229166716337204, "reward_std": 0.1530931033194065, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.25000000558793545, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666716337204, "rewards/xmlcount_reward_func": 0.5, "step": 249 }, { "completion_length": 271.9166793823242, "epoch": 0.13381506757660913, "grad_norm": 0.3518020808696747, "kl": 0.12262176536023617, "learning_rate": 2.43689976739403e-10, "loss": 0.0049, "reward": 1.5572916716337204, "reward_std": 0.25069504231214523, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.20833333395421505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333395421505, "rewards/xmlcount_reward_func": 0.453125, "step": 250 } ], "logging_steps": 1, "max_steps": 250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }