grpo_lora / trainer_state.json
wendyaw's picture
Upload folder using huggingface_hub
582d5db verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.13381506757660913,
"eval_steps": 500,
"global_step": 250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 281.1666851043701,
"epoch": 0.0005352602703064365,
"grad_norm": 0.5917035937309265,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0,
"reward": -0.10487502068281174,
"reward_std": 0.644918380305171,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.1041666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.37570834904909134,
"step": 1
},
{
"completion_length": 590.8750171661377,
"epoch": 0.001070520540612873,
"grad_norm": 0.7477704286575317,
"kl": 0.0,
"learning_rate": 2.0000000000000002e-07,
"loss": 0.0,
"reward": 0.22162500163540244,
"reward_std": 0.09485530573874712,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.09662500163540244,
"step": 2
},
{
"completion_length": 539.6666870117188,
"epoch": 0.0016057808109193096,
"grad_norm": 0.4429571032524109,
"kl": 0.002077269156870898,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.0001,
"reward": 0.016208335757255554,
"reward_std": 0.6246479228138924,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.2083333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1921250014565885,
"step": 3
},
{
"completion_length": 185.7916717529297,
"epoch": 0.002141041081225746,
"grad_norm": 0.6855182647705078,
"kl": 0.0009879921708488837,
"learning_rate": 6.000000000000001e-07,
"loss": 0.0,
"reward": 1.0669583305716515,
"reward_std": 0.5203845072537661,
"rewards/correctness_reward_func": 0.7500000298023224,
"rewards/int_reward_func": 0.3541666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.03720833268016577,
"step": 4
},
{
"completion_length": 587.3750076293945,
"epoch": 0.0026763013515321826,
"grad_norm": 0.7880218029022217,
"kl": 0.0007925744503154419,
"learning_rate": 8.000000000000001e-07,
"loss": 0.0,
"reward": 0.09533333079889417,
"reward_std": 0.5927679911255836,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.1041666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09216667525470257,
"step": 5
},
{
"completion_length": 245.70834159851074,
"epoch": 0.003211561621838619,
"grad_norm": 0.9220851063728333,
"kl": 0.001330614773905836,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0001,
"reward": 0.08295834437012672,
"reward_std": 0.5829638005234301,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.0416666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.04204164445400238,
"step": 6
},
{
"completion_length": 448.7083549499512,
"epoch": 0.0037468218921450553,
"grad_norm": 0.8303655385971069,
"kl": 0.0012542481999844313,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.0001,
"reward": 0.03099999949336052,
"reward_std": 0.7598095312714577,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.16666666977107525,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13566668145358562,
"step": 7
},
{
"completion_length": 197.95833587646484,
"epoch": 0.004282082162451492,
"grad_norm": 1.0582274198532104,
"kl": 0.0008733256690902635,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.0,
"reward": 0.12341666966676712,
"reward_std": 0.09142086654901505,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.12341666780412197,
"step": 8
},
{
"completion_length": 176.41666984558105,
"epoch": 0.004817342432757929,
"grad_norm": 1.0369133949279785,
"kl": 0.005098502180771902,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.0002,
"reward": 0.32220835238695145,
"reward_std": 0.3896215371787548,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.11387500539422035,
"step": 9
},
{
"completion_length": 267.66667556762695,
"epoch": 0.005352602703064365,
"grad_norm": 0.4552709460258484,
"kl": 0.001112774269131478,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.0,
"reward": 0.31854166090488434,
"reward_std": 0.3717608004808426,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.1250000037252903,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.02687499998137355,
"step": 10
},
{
"completion_length": 319.58333587646484,
"epoch": 0.005887862973370801,
"grad_norm": 0.7992357015609741,
"kl": 0.0012863876472692937,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0001,
"reward": 0.2948333490639925,
"reward_std": 0.38963131979107857,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.10416666977107525,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.10733332857489586,
"step": 11
},
{
"completion_length": 375.29166984558105,
"epoch": 0.006423123243677238,
"grad_norm": 0.965032160282135,
"kl": 0.0008803782802715432,
"learning_rate": 2.2e-06,
"loss": 0.0,
"reward": 1.012750007212162,
"reward_std": 0.5616761147975922,
"rewards/correctness_reward_func": 0.8333333432674408,
"rewards/int_reward_func": 0.22916666977107525,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.04975000023841858,
"step": 12
},
{
"completion_length": 482.4583435058594,
"epoch": 0.0069583835139836745,
"grad_norm": 1.502334713935852,
"kl": 0.0028791724907932803,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.0001,
"reward": -0.32758333161473274,
"reward_std": 0.7293146029114723,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.02083333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.34841667115688324,
"step": 13
},
{
"completion_length": 236.41667366027832,
"epoch": 0.007493643784290111,
"grad_norm": 0.8761454820632935,
"kl": 0.0018893379892688245,
"learning_rate": 2.6e-06,
"loss": 0.0001,
"reward": -0.017458327114582062,
"reward_std": 0.31026666425168514,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.06250000186264515,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07995832804590464,
"step": 14
},
{
"completion_length": 306.6250104904175,
"epoch": 0.008028904054596548,
"grad_norm": 1.070983648300171,
"kl": 0.0017425262776669115,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.0001,
"reward": -0.08808333426713943,
"reward_std": 0.37953382171690464,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08808333426713943,
"step": 15
},
{
"completion_length": 366.6666793823242,
"epoch": 0.008564164324902984,
"grad_norm": 0.6342064738273621,
"kl": 0.0009713478648336604,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.03991668112576008,
"reward_std": 0.8613052181899548,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.0416666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.16841666959226131,
"step": 16
},
{
"completion_length": 297.75000381469727,
"epoch": 0.00909942459520942,
"grad_norm": 0.629483163356781,
"kl": 0.0008868449804140255,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.0,
"reward": 1.0445833504199982,
"reward_std": 0.3152644243091345,
"rewards/correctness_reward_func": 0.6666666716337204,
"rewards/int_reward_func": 0.3333333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.044583337381482124,
"step": 17
},
{
"completion_length": 185.79166793823242,
"epoch": 0.009634684865515858,
"grad_norm": 1.0232924222946167,
"kl": 0.0012610588310053572,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.0001,
"reward": 0.21658334136009216,
"reward_std": 0.18211832642555237,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.15408333763480186,
"step": 18
},
{
"completion_length": 361.5416679382324,
"epoch": 0.010169945135822294,
"grad_norm": 0.8113482594490051,
"kl": 0.0015902465383987874,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.0001,
"reward": 0.44666668586432934,
"reward_std": 0.5656739473342896,
"rewards/correctness_reward_func": 0.2500000074505806,
"rewards/int_reward_func": 0.18750000558793545,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.009166665724478662,
"step": 19
},
{
"completion_length": 254.66666984558105,
"epoch": 0.01070520540612873,
"grad_norm": 1.1768397092819214,
"kl": 0.004655150449252687,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.0002,
"reward": 0.48504166305065155,
"reward_std": 0.4368314128369093,
"rewards/correctness_reward_func": 0.3333333432674408,
"rewards/int_reward_func": 0.0833333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.06837500259280205,
"step": 20
},
{
"completion_length": 137.91667366027832,
"epoch": 0.011240465676435166,
"grad_norm": 1.0621542930603027,
"kl": 0.0022582018573302776,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0001,
"reward": 0.3660416714847088,
"reward_std": 0.5700555201619864,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.1875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.011874992400407791,
"step": 21
},
{
"completion_length": 88.91667175292969,
"epoch": 0.011775725946741603,
"grad_norm": 1.2175222635269165,
"kl": 0.0020840048528043553,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.0001,
"reward": 0.8160417033359408,
"reward_std": 0.26041645370423794,
"rewards/correctness_reward_func": 0.4166666865348816,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.14937500189989805,
"step": 22
},
{
"completion_length": 150.2083339691162,
"epoch": 0.01231098621704804,
"grad_norm": 0.8459586501121521,
"kl": 0.0017513818893348798,
"learning_rate": 4.4e-06,
"loss": 0.0001,
"reward": 0.5479583460837603,
"reward_std": 0.6487118303775787,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.2916666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0062916697934269905,
"step": 23
},
{
"completion_length": 154.87500381469727,
"epoch": 0.012846246487354477,
"grad_norm": 0.8675287961959839,
"kl": 0.001371489226585254,
"learning_rate": 4.600000000000001e-06,
"loss": 0.0001,
"reward": 0.17075001262128353,
"reward_std": 0.19458706118166447,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.14583333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.024916673079133034,
"step": 24
},
{
"completion_length": 150.7500057220459,
"epoch": 0.013381506757660913,
"grad_norm": 0.8717443943023682,
"kl": 0.0020001856610178947,
"learning_rate": 4.800000000000001e-06,
"loss": 0.0001,
"reward": 0.20204169023782015,
"reward_std": 0.4490640014410019,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.06250000186264515,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.027124996297061443,
"step": 25
},
{
"completion_length": 467.9583568572998,
"epoch": 0.013916767027967349,
"grad_norm": 1.1116266250610352,
"kl": 0.0012415697274263948,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.304541677236557,
"reward_std": 0.28451096825301647,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.14583333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.07537499908357859,
"step": 26
},
{
"completion_length": 433.5833339691162,
"epoch": 0.014452027298273785,
"grad_norm": 0.8203855156898499,
"kl": 0.001017560571199283,
"learning_rate": 4.999756310023261e-06,
"loss": 0.0,
"reward": 0.2795000094920397,
"reward_std": 0.11828233953565359,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.14583333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.13366666994988918,
"step": 27
},
{
"completion_length": 126.00000381469727,
"epoch": 0.014987287568580221,
"grad_norm": 1.2663164138793945,
"kl": 0.0014343319344334304,
"learning_rate": 4.999025287600886e-06,
"loss": 0.0001,
"reward": 0.6117500364780426,
"reward_std": 0.4831337593495846,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.2083333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2367500104010105,
"step": 28
},
{
"completion_length": 229.66667556762695,
"epoch": 0.01552254783888666,
"grad_norm": 0.8950901627540588,
"kl": 0.0015667550032958388,
"learning_rate": 4.997807075247147e-06,
"loss": 0.0001,
"reward": 0.1262916720006615,
"reward_std": 0.148749228566885,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.02083333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1054583361838013,
"step": 29
},
{
"completion_length": 210.8750114440918,
"epoch": 0.016057808109193095,
"grad_norm": 1.1234443187713623,
"kl": 0.0019746975012822077,
"learning_rate": 4.996101910454953e-06,
"loss": 0.0001,
"reward": 0.3709999993443489,
"reward_std": 0.3687104620039463,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.20833333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0793333351612091,
"step": 30
},
{
"completion_length": 177.62500190734863,
"epoch": 0.01659306837949953,
"grad_norm": 0.9238327145576477,
"kl": 0.0028813415410695598,
"learning_rate": 4.993910125649561e-06,
"loss": 0.0001,
"reward": 0.39008335024118423,
"reward_std": 0.3615882135927677,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.27083333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.11925000231713057,
"step": 31
},
{
"completion_length": 173.79167366027832,
"epoch": 0.017128328649805968,
"grad_norm": 1.0448777675628662,
"kl": 0.006160023040138185,
"learning_rate": 4.9912321481237616e-06,
"loss": 0.0002,
"reward": 0.14554167166352272,
"reward_std": 0.16822291910648346,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0416666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.10387500375509262,
"step": 32
},
{
"completion_length": 486.75000762939453,
"epoch": 0.017663588920112404,
"grad_norm": 0.6601158380508423,
"kl": 0.001103398812119849,
"learning_rate": 4.988068499954578e-06,
"loss": 0.0,
"reward": -0.5304166711866856,
"reward_std": 1.0785409808158875,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.06250000186264515,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6762500237673521,
"step": 33
},
{
"completion_length": 268.12501525878906,
"epoch": 0.01819884919041884,
"grad_norm": 0.910798966884613,
"kl": 0.0013230827316874638,
"learning_rate": 4.984419797901491e-06,
"loss": 0.0001,
"reward": 0.6703750789165497,
"reward_std": 0.36514274775981903,
"rewards/correctness_reward_func": 0.4166666865348816,
"rewards/int_reward_func": 0.12500000558793545,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1287083402276039,
"step": 34
},
{
"completion_length": 267.708345413208,
"epoch": 0.018734109460725276,
"grad_norm": 1.027286171913147,
"kl": 0.004269710392691195,
"learning_rate": 4.980286753286196e-06,
"loss": 0.0002,
"reward": 0.4232500046491623,
"reward_std": 0.2645573355257511,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.27083334140479565,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1524166688323021,
"step": 35
},
{
"completion_length": 245.8750114440918,
"epoch": 0.019269369731031716,
"grad_norm": 1.0046695470809937,
"kl": 0.005482323234900832,
"learning_rate": 4.975670171853926e-06,
"loss": 0.0002,
"reward": 0.2149583324790001,
"reward_std": 0.7910580635070801,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.2083333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07670832984149456,
"step": 36
},
{
"completion_length": 560.7916870117188,
"epoch": 0.019804630001338152,
"grad_norm": 0.8885159492492676,
"kl": 0.002885772308218293,
"learning_rate": 4.970570953616383e-06,
"loss": 0.0001,
"reward": 0.1944583347067237,
"reward_std": 0.05626108031719923,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1944583347067237,
"step": 37
},
{
"completion_length": 522.8750133514404,
"epoch": 0.020339890271644588,
"grad_norm": 0.5502146482467651,
"kl": 0.009517412836430594,
"learning_rate": 4.964990092676263e-06,
"loss": 0.0004,
"reward": 0.2919999957084656,
"reward_std": 0.674026682972908,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08299997518770397,
"step": 38
},
{
"completion_length": 640.2083511352539,
"epoch": 0.020875150541951024,
"grad_norm": 0.7062350511550903,
"kl": 0.0034277847153134644,
"learning_rate": 4.958928677033465e-06,
"loss": 0.0001,
"reward": -0.004291646182537079,
"reward_std": 0.7502853199839592,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.1250000037252903,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.21262501180171967,
"step": 39
},
{
"completion_length": 491.08335876464844,
"epoch": 0.02141041081225746,
"grad_norm": 0.7743443250656128,
"kl": 0.004902548622339964,
"learning_rate": 4.9523878883729794e-06,
"loss": 0.0002,
"reward": 0.06212499737739563,
"reward_std": 0.8643132671713829,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.1458333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1670416765846312,
"step": 40
},
{
"completion_length": 301.45834159851074,
"epoch": 0.021945671082563897,
"grad_norm": 0.7774906754493713,
"kl": 0.007360402669291943,
"learning_rate": 4.9453690018345144e-06,
"loss": 0.0003,
"reward": -0.09349998086690903,
"reward_std": 0.7777874618768692,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.1041666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.2810000069439411,
"step": 41
},
{
"completion_length": 455.5833435058594,
"epoch": 0.022480931352870333,
"grad_norm": 0.6825084090232849,
"kl": 0.0033266296959482133,
"learning_rate": 4.937873385763909e-06,
"loss": 0.0001,
"reward": 0.2603750079870224,
"reward_std": 1.1693618446588516,
"rewards/correctness_reward_func": 0.2500000074505806,
"rewards/int_reward_func": 0.1458333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.02083333395421505,
"rewards/xmlcount_reward_func": -0.15629167575389147,
"step": 42
},
{
"completion_length": 254.7500057220459,
"epoch": 0.02301619162317677,
"grad_norm": 1.0424985885620117,
"kl": 0.013458715460728854,
"learning_rate": 4.9299025014463665e-06,
"loss": 0.0005,
"reward": 0.6490416824817657,
"reward_std": 0.6297374591231346,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.3125000111758709,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.16987500933464617,
"step": 43
},
{
"completion_length": 243.8333339691162,
"epoch": 0.023551451893483205,
"grad_norm": 0.6618691682815552,
"kl": 0.013514326536096632,
"learning_rate": 4.921457902821578e-06,
"loss": 0.0005,
"reward": 0.235000004991889,
"reward_std": 0.4793561212718487,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.12500000558793545,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.02666667103767395,
"step": 44
},
{
"completion_length": 247.1250057220459,
"epoch": 0.02408671216378964,
"grad_norm": 0.8150098919868469,
"kl": 0.006718623684719205,
"learning_rate": 4.912541236180779e-06,
"loss": 0.0003,
"reward": 0.3942916840314865,
"reward_std": 0.4090446010231972,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.1458333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.08179166866466403,
"step": 45
},
{
"completion_length": 311.95833587646484,
"epoch": 0.02462197243409608,
"grad_norm": 0.9185469746589661,
"kl": 0.011615818890277296,
"learning_rate": 4.903154239845798e-06,
"loss": 0.0005,
"reward": 0.6513333357870579,
"reward_std": 0.26453326642513275,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.14583333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.005500006955116987,
"step": 46
},
{
"completion_length": 347.25000762939453,
"epoch": 0.025157232704402517,
"grad_norm": 0.9208869338035583,
"kl": 0.019979659002274275,
"learning_rate": 4.893298743830168e-06,
"loss": 0.0008,
"reward": 0.4205416589975357,
"reward_std": 0.4555768258869648,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.0625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.108041662722826,
"step": 47
},
{
"completion_length": 245.6666717529297,
"epoch": 0.025692492974708953,
"grad_norm": 0.8743010759353638,
"kl": 0.01431413902901113,
"learning_rate": 4.882976669482368e-06,
"loss": 0.0006,
"reward": 0.269333329051733,
"reward_std": 0.6685181586071849,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.06099999323487282,
"step": 48
},
{
"completion_length": 431.12501525878906,
"epoch": 0.02622775324501539,
"grad_norm": 0.6829207539558411,
"kl": 0.010582708870060742,
"learning_rate": 4.8721900291112415e-06,
"loss": 0.0004,
"reward": 1.1573750227689743,
"reward_std": 0.5191336497664452,
"rewards/correctness_reward_func": 0.75,
"rewards/int_reward_func": 0.27083333767950535,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1365416720509529,
"step": 49
},
{
"completion_length": 362.2916793823242,
"epoch": 0.026763013515321826,
"grad_norm": 0.9591223001480103,
"kl": 0.021306635811924934,
"learning_rate": 4.860940925593703e-06,
"loss": 0.0009,
"reward": 0.5036666616797447,
"reward_std": 0.6060219556093216,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.16666666977107525,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.06250000186264515,
"rewards/xmlcount_reward_func": 0.19116667530033737,
"step": 50
},
{
"completion_length": 105.25,
"epoch": 0.02729827378562826,
"grad_norm": 1.2040213346481323,
"kl": 0.03592631733044982,
"learning_rate": 4.849231551964771e-06,
"loss": 0.0014,
"reward": 0.9875416681170464,
"reward_std": 0.5127010717988014,
"rewards/correctness_reward_func": 0.4166666865348816,
"rewards/int_reward_func": 0.20833334140479565,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0416666679084301,
"rewards/xmlcount_reward_func": 0.32087502628564835,
"step": 51
},
{
"completion_length": 105.50000190734863,
"epoch": 0.027833534055934698,
"grad_norm": 0.9755409359931946,
"kl": 0.03177254740148783,
"learning_rate": 4.837064190990036e-06,
"loss": 0.0013,
"reward": 0.36112499982118607,
"reward_std": 0.23680441081523895,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.14583333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.02083333395421505,
"rewards/xmlcount_reward_func": 0.19445833936333656,
"step": 52
},
{
"completion_length": 115.08333778381348,
"epoch": 0.028368794326241134,
"grad_norm": 1.0774939060211182,
"kl": 0.04096163995563984,
"learning_rate": 4.824441214720629e-06,
"loss": 0.0016,
"reward": 0.39137500151991844,
"reward_std": 0.21243570372462273,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0416666679084301,
"rewards/xmlcount_reward_func": 0.22470833733677864,
"step": 53
},
{
"completion_length": 71.87500095367432,
"epoch": 0.02890405459654757,
"grad_norm": 1.5860177278518677,
"kl": 0.07264666631817818,
"learning_rate": 4.811365084030784e-06,
"loss": 0.0029,
"reward": 0.7297500222921371,
"reward_std": 0.40594012290239334,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.25000000558793545,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0416666679084301,
"rewards/xmlcount_reward_func": 0.35475001111626625,
"step": 54
},
{
"completion_length": 335.0416831970215,
"epoch": 0.029439314866854006,
"grad_norm": 0.8996623158454895,
"kl": 0.03017168352380395,
"learning_rate": 4.7978383481380865e-06,
"loss": 0.0012,
"reward": 0.21566667163278908,
"reward_std": 0.38637750223279,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0416666679084301,
"rewards/xmlcount_reward_func": 0.11149999999906868,
"step": 55
},
{
"completion_length": 116.00000476837158,
"epoch": 0.029974575137160443,
"grad_norm": 1.1853581666946411,
"kl": 0.03844497771933675,
"learning_rate": 4.783863644106502e-06,
"loss": 0.0015,
"reward": 0.35095833986997604,
"reward_std": 0.21118691470474005,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0416666679084301,
"rewards/xmlcount_reward_func": 0.18429166823625565,
"step": 56
},
{
"completion_length": 203.58334350585938,
"epoch": 0.030509835407466882,
"grad_norm": 0.9677571654319763,
"kl": 0.03654019068926573,
"learning_rate": 4.769443696332272e-06,
"loss": 0.0015,
"reward": 0.867875000461936,
"reward_std": 0.6674522012472153,
"rewards/correctness_reward_func": 0.3333333432674408,
"rewards/int_reward_func": 0.2708333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.08333333395421505,
"rewards/xmlcount_reward_func": 0.1803750041872263,
"step": 57
},
{
"completion_length": 226.58334159851074,
"epoch": 0.03104509567777332,
"grad_norm": 0.7546242475509644,
"kl": 0.10618894919753075,
"learning_rate": 4.754581316012785e-06,
"loss": 0.0042,
"reward": 0.7405833136290312,
"reward_std": 1.0614993423223495,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.2708333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.09474998340010643,
"step": 58
},
{
"completion_length": 96.75000190734863,
"epoch": 0.03158035594807975,
"grad_norm": 0.9959896206855774,
"kl": 0.040049958042800426,
"learning_rate": 4.7392794005985324e-06,
"loss": 0.0016,
"reward": 1.0202916860580444,
"reward_std": 0.5125212594866753,
"rewards/correctness_reward_func": 0.2500000074505806,
"rewards/int_reward_func": 0.4166666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.35362500697374344,
"step": 59
},
{
"completion_length": 77.83333587646484,
"epoch": 0.03211561621838619,
"grad_norm": 1.0476152896881104,
"kl": 0.08419935218989849,
"learning_rate": 4.723540933228245e-06,
"loss": 0.0034,
"reward": 0.6385000422596931,
"reward_std": 0.23676574788987637,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.27083333767950535,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.36766667664051056,
"step": 60
},
{
"completion_length": 76.0416669845581,
"epoch": 0.03265087648869262,
"grad_norm": 1.6621463298797607,
"kl": 0.06288609141483903,
"learning_rate": 4.707368982147318e-06,
"loss": 0.0025,
"reward": 0.7950416952371597,
"reward_std": 0.2815567087382078,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.2916666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.1458333395421505,
"rewards/xmlcount_reward_func": 0.35754168033599854,
"step": 61
},
{
"completion_length": 97.87500190734863,
"epoch": 0.03318613675899906,
"grad_norm": 1.3513109683990479,
"kl": 0.07356535829603672,
"learning_rate": 4.690766700109659e-06,
"loss": 0.0029,
"reward": 1.054708331823349,
"reward_std": 0.3879717066884041,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.4791666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.06250000186264515,
"rewards/xmlcount_reward_func": 0.3463750060182065,
"step": 62
},
{
"completion_length": 356.79167556762695,
"epoch": 0.0337213970293055,
"grad_norm": 0.9732988476753235,
"kl": 0.0708354264497757,
"learning_rate": 4.673737323763048e-06,
"loss": 0.0028,
"reward": 0.3889166936278343,
"reward_std": 1.0406904257833958,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.0833333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0833333358168602,
"rewards/xmlcount_reward_func": 0.05558334290981293,
"step": 63
},
{
"completion_length": 124.16666984558105,
"epoch": 0.034256657299611935,
"grad_norm": 1.4622650146484375,
"kl": 0.08344197925180197,
"learning_rate": 4.656284173018144e-06,
"loss": 0.0033,
"reward": 1.0468750149011612,
"reward_std": 0.7830385342240334,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.2708333432674408,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.16666666977107525,
"rewards/xmlcount_reward_func": 0.3593750074505806,
"step": 64
},
{
"completion_length": 233.333345413208,
"epoch": 0.034791917569918375,
"grad_norm": 2.0504956245422363,
"kl": 0.07171727810055017,
"learning_rate": 4.638410650401267e-06,
"loss": 0.0029,
"reward": 1.2617916613817215,
"reward_std": 0.8660007119178772,
"rewards/correctness_reward_func": 0.5000000149011612,
"rewards/int_reward_func": 0.3750000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.3242916837334633,
"step": 65
},
{
"completion_length": 147.5833339691162,
"epoch": 0.03532717784022481,
"grad_norm": 0.6744219660758972,
"kl": 0.11512961238622665,
"learning_rate": 4.620120240391065e-06,
"loss": 0.0046,
"reward": 0.8016250282526016,
"reward_std": 0.29210690781474113,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.2291666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0416666679084301,
"rewards/xmlcount_reward_func": 0.44745834171772003,
"step": 66
},
{
"completion_length": 76.41666793823242,
"epoch": 0.03586243811053125,
"grad_norm": 0.8137429356575012,
"kl": 0.11052755452692509,
"learning_rate": 4.601416508739211e-06,
"loss": 0.0044,
"reward": 1.2232083678245544,
"reward_std": 0.6085939556360245,
"rewards/correctness_reward_func": 0.3333333432674408,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.18750000558793545,
"rewards/xmlcount_reward_func": 0.45237500965595245,
"step": 67
},
{
"completion_length": 103.91666984558105,
"epoch": 0.03639769838083768,
"grad_norm": 1.3938682079315186,
"kl": 0.10067875497043133,
"learning_rate": 4.582303101775249e-06,
"loss": 0.004,
"reward": 0.7972083538770676,
"reward_std": 0.43218278884887695,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.14583333767950535,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.22916667349636555,
"rewards/xmlcount_reward_func": 0.42220834642648697,
"step": 68
},
{
"completion_length": 607.5000114440918,
"epoch": 0.03693295865114412,
"grad_norm": 0.7114788293838501,
"kl": 0.059856235020561144,
"learning_rate": 4.562783745695738e-06,
"loss": 0.0024,
"reward": 0.6020833402872086,
"reward_std": 0.26822593063116074,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.1875000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.1041666716337204,
"rewards/xmlcount_reward_func": 0.31041666865348816,
"step": 69
},
{
"completion_length": 366.1666736602783,
"epoch": 0.03746821892145055,
"grad_norm": 1.5701572895050049,
"kl": 0.12799374386668205,
"learning_rate": 4.542862245837821e-06,
"loss": 0.0051,
"reward": 0.6387916915118694,
"reward_std": 0.3272341303527355,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.16666667349636555,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.14583333395421505,
"rewards/xmlcount_reward_func": 0.3262916784733534,
"step": 70
},
{
"completion_length": 150.9166717529297,
"epoch": 0.03800347919175699,
"grad_norm": 1.3559887409210205,
"kl": 0.17991142719984055,
"learning_rate": 4.522542485937369e-06,
"loss": 0.0072,
"reward": 0.8541667014360428,
"reward_std": 0.28774577379226685,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.2500000074505806,
"rewards/xmlcount_reward_func": 0.4791666716337204,
"step": 71
},
{
"completion_length": 92.08333778381348,
"epoch": 0.03853873946206343,
"grad_norm": 0.9042914509773254,
"kl": 0.21130416169762611,
"learning_rate": 4.501828427371834e-06,
"loss": 0.0085,
"reward": 0.7811249941587448,
"reward_std": 0.20265305042266846,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.1666666679084301,
"rewards/xmlcount_reward_func": 0.4894583374261856,
"step": 72
},
{
"completion_length": 125.45833778381348,
"epoch": 0.039073999732369864,
"grad_norm": 0.8635703921318054,
"kl": 0.13566209375858307,
"learning_rate": 4.4807241083879774e-06,
"loss": 0.0054,
"reward": 1.1882500350475311,
"reward_std": 0.8479792177677155,
"rewards/correctness_reward_func": 0.3333333432674408,
"rewards/int_reward_func": 0.1875000037252903,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.2708333358168602,
"rewards/xmlcount_reward_func": 0.39658334106206894,
"step": 73
},
{
"completion_length": 68.79166984558105,
"epoch": 0.039609260002676304,
"grad_norm": 1.192706823348999,
"kl": 0.27350207418203354,
"learning_rate": 4.4592336433146e-06,
"loss": 0.0109,
"reward": 1.7708333730697632,
"reward_std": 0.11558076366782188,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.3125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333432674408,
"rewards/xmlcount_reward_func": 0.5,
"step": 74
},
{
"completion_length": 90.79166984558105,
"epoch": 0.04014452027298274,
"grad_norm": 1.3415632247924805,
"kl": 0.15339597314596176,
"learning_rate": 4.437361221760449e-06,
"loss": 0.0061,
"reward": 1.0463333874940872,
"reward_std": 0.5073548853397369,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.2083333358168602,
"rewards/xmlcount_reward_func": 0.46300000697374344,
"step": 75
},
{
"completion_length": 163.20834159851074,
"epoch": 0.040679780543289176,
"grad_norm": 1.5307588577270508,
"kl": 0.1565667698159814,
"learning_rate": 4.415111107797445e-06,
"loss": 0.0063,
"reward": 1.239583358168602,
"reward_std": 0.48138320073485374,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.2083333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3125000074505806,
"rewards/xmlcount_reward_func": 0.4687500074505806,
"step": 76
},
{
"completion_length": 162.20834159851074,
"epoch": 0.04121504081359561,
"grad_norm": 1.7075368165969849,
"kl": 0.2482675537467003,
"learning_rate": 4.3924876391293915e-06,
"loss": 0.0099,
"reward": 1.5780000388622284,
"reward_std": 0.7741712592542171,
"rewards/correctness_reward_func": 0.416666679084301,
"rewards/int_reward_func": 0.3333333469927311,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.37500000558793545,
"rewards/xmlcount_reward_func": 0.453000009059906,
"step": 77
},
{
"completion_length": 90.83333587646484,
"epoch": 0.04175030108390205,
"grad_norm": 1.605960726737976,
"kl": 0.1952410712838173,
"learning_rate": 4.36949522624633e-06,
"loss": 0.0078,
"reward": 1.7031250596046448,
"reward_std": 0.5450708866119385,
"rewards/correctness_reward_func": 0.6666666865348816,
"rewards/int_reward_func": 0.2083333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3333333395421505,
"rewards/xmlcount_reward_func": 0.4947916716337204,
"step": 78
},
{
"completion_length": 136.16666984558105,
"epoch": 0.04228556135420848,
"grad_norm": 0.673092246055603,
"kl": 0.1753321774303913,
"learning_rate": 4.346138351564711e-06,
"loss": 0.007,
"reward": 1.2291666716337204,
"reward_std": 0.4924144148826599,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.291666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.29166667349636555,
"rewards/xmlcount_reward_func": 0.4791666716337204,
"step": 79
},
{
"completion_length": 71.20833683013916,
"epoch": 0.04282082162451492,
"grad_norm": 1.5928924083709717,
"kl": 0.31612617522478104,
"learning_rate": 4.322421568553529e-06,
"loss": 0.0126,
"reward": 2.0416666865348816,
"reward_std": 0.20412416756153107,
"rewards/correctness_reward_func": 0.5833333358168602,
"rewards/int_reward_func": 0.4791666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 80
},
{
"completion_length": 112.33333492279053,
"epoch": 0.043356081894821354,
"grad_norm": 0.4769633412361145,
"kl": 0.21202785894274712,
"learning_rate": 4.2983495008466285e-06,
"loss": 0.0085,
"reward": 1.7968750447034836,
"reward_std": 0.15385404229164124,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375000074505806,
"rewards/xmlcount_reward_func": 0.484375,
"step": 81
},
{
"completion_length": 175.41667366027832,
"epoch": 0.04389134216512779,
"grad_norm": 0.9674685597419739,
"kl": 0.13749209698289633,
"learning_rate": 4.273926841341303e-06,
"loss": 0.0055,
"reward": 1.6510417461395264,
"reward_std": 0.5754482969641685,
"rewards/correctness_reward_func": 0.416666679084301,
"rewards/int_reward_func": 0.4166666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.33333334140479565,
"rewards/xmlcount_reward_func": 0.484375,
"step": 82
},
{
"completion_length": 67.87500190734863,
"epoch": 0.04442660243543423,
"grad_norm": 1.0911532640457153,
"kl": 0.28480928763747215,
"learning_rate": 4.249158351283414e-06,
"loss": 0.0114,
"reward": 1.4166666865348816,
"reward_std": 0.12909945845603943,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.4791666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375000074505806,
"rewards/xmlcount_reward_func": 0.5,
"step": 83
},
{
"completion_length": 71.91666889190674,
"epoch": 0.044961862705740666,
"grad_norm": 1.9787883758544922,
"kl": 0.2760180849581957,
"learning_rate": 4.224048859339175e-06,
"loss": 0.011,
"reward": 1.1250000596046448,
"reward_std": 0.20412414520978928,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25000000558793545,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.37500000558793545,
"rewards/xmlcount_reward_func": 0.5,
"step": 84
},
{
"completion_length": 124.37500190734863,
"epoch": 0.045497122976047105,
"grad_norm": 0.5233182907104492,
"kl": 0.20504293218255043,
"learning_rate": 4.198603260653792e-06,
"loss": 0.0082,
"reward": 1.2343750298023224,
"reward_std": 0.3668263405561447,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.3541666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.4635416716337204,
"step": 85
},
{
"completion_length": 101.62500381469727,
"epoch": 0.04603238324635354,
"grad_norm": 1.2110995054244995,
"kl": 0.15592870488762856,
"learning_rate": 4.172826515897146e-06,
"loss": 0.0062,
"reward": 1.7083334028720856,
"reward_std": 0.5623037368059158,
"rewards/correctness_reward_func": 0.5000000223517418,
"rewards/int_reward_func": 0.2291666753590107,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 86
},
{
"completion_length": 126.25000762939453,
"epoch": 0.04656764351665998,
"grad_norm": 0.7313621640205383,
"kl": 0.18805699050426483,
"learning_rate": 4.146723650296701e-06,
"loss": 0.0075,
"reward": 1.3646250218153,
"reward_std": 0.3509002774953842,
"rewards/correctness_reward_func": 0.3333333432674408,
"rewards/int_reward_func": 0.14583333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.416666679084301,
"rewards/xmlcount_reward_func": 0.4687916710972786,
"step": 87
},
{
"completion_length": 209.45833683013916,
"epoch": 0.04710290378696641,
"grad_norm": 0.725437343120575,
"kl": 0.16482173651456833,
"learning_rate": 4.120299752657828e-06,
"loss": 0.0066,
"reward": 1.046916663646698,
"reward_std": 0.4301242418587208,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.1875000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.2291666716337204,
"rewards/xmlcount_reward_func": 0.4635833352804184,
"step": 88
},
{
"completion_length": 165.0833396911621,
"epoch": 0.04763816405727285,
"grad_norm": 0.7724325656890869,
"kl": 0.20376655086874962,
"learning_rate": 4.093559974371725e-06,
"loss": 0.0082,
"reward": 1.0260416865348816,
"reward_std": 0.1994822435081005,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.1250000037252903,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.484375,
"step": 89
},
{
"completion_length": 98.66666984558105,
"epoch": 0.04817342432757928,
"grad_norm": 1.395373821258545,
"kl": 0.2529403530061245,
"learning_rate": 4.066509528411151e-06,
"loss": 0.0101,
"reward": 1.0780000239610672,
"reward_std": 0.2602536380290985,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.1875000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333432674408,
"rewards/xmlcount_reward_func": 0.4946666657924652,
"step": 90
},
{
"completion_length": 89.25,
"epoch": 0.04870868459788572,
"grad_norm": 0.8370155692100525,
"kl": 0.19209491834044456,
"learning_rate": 4.039153688314146e-06,
"loss": 0.0077,
"reward": 1.2864583879709244,
"reward_std": 0.43756843730807304,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.0833333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333432674408,
"rewards/xmlcount_reward_func": 0.4947916716337204,
"step": 91
},
{
"completion_length": 63.333335876464844,
"epoch": 0.04924394486819216,
"grad_norm": 2.1854286193847656,
"kl": 0.21549397706985474,
"learning_rate": 4.011497787155938e-06,
"loss": 0.0086,
"reward": 1.2964583337306976,
"reward_std": 0.1316254585981369,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3541666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333358168602,
"rewards/xmlcount_reward_func": 0.48395833373069763,
"step": 92
},
{
"completion_length": 88.62500190734863,
"epoch": 0.049779205138498595,
"grad_norm": 1.6499943733215332,
"kl": 0.2516642101109028,
"learning_rate": 3.983547216509254e-06,
"loss": 0.0101,
"reward": 1.6613333523273468,
"reward_std": 0.6053861007094383,
"rewards/correctness_reward_func": 0.4166666716337204,
"rewards/int_reward_func": 0.3541666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333358168602,
"rewards/xmlcount_reward_func": 0.4946666657924652,
"step": 93
},
{
"completion_length": 134.12500381469727,
"epoch": 0.050314465408805034,
"grad_norm": 0.9040616154670715,
"kl": 0.18027858808636665,
"learning_rate": 3.955307425393224e-06,
"loss": 0.0072,
"reward": 1.1510417014360428,
"reward_std": 0.4075661599636078,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3750000037252903,
"rewards/xmlcount_reward_func": 0.484375,
"step": 94
},
{
"completion_length": 143.12500381469727,
"epoch": 0.05084972567911147,
"grad_norm": 0.8769562244415283,
"kl": 0.2623859569430351,
"learning_rate": 3.92678391921108e-06,
"loss": 0.0105,
"reward": 1.3593750298023224,
"reward_std": 0.3879491835832596,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.3958333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333358168602,
"rewards/xmlcount_reward_func": 0.484375,
"step": 95
},
{
"completion_length": 80.58333587646484,
"epoch": 0.051384985949417906,
"grad_norm": 1.6260957717895508,
"kl": 0.2531866990029812,
"learning_rate": 3.897982258676867e-06,
"loss": 0.0101,
"reward": 1.3281250298023224,
"reward_std": 0.37103308364748955,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.33333334140479565,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375000074505806,
"rewards/xmlcount_reward_func": 0.4739583432674408,
"step": 96
},
{
"completion_length": 104.91666889190674,
"epoch": 0.05192024621972434,
"grad_norm": 0.8191968202590942,
"kl": 0.27327974885702133,
"learning_rate": 3.868908058731376e-06,
"loss": 0.0109,
"reward": 1.1041666865348816,
"reward_std": 0.16337091475725174,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.2500000037252903,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.375,
"rewards/xmlcount_reward_func": 0.4791666716337204,
"step": 97
},
{
"completion_length": 62.00000190734863,
"epoch": 0.05245550649003078,
"grad_norm": 0.3426864743232727,
"kl": 0.2767893858253956,
"learning_rate": 3.839566987447492e-06,
"loss": 0.0111,
"reward": 1.3541666865348816,
"reward_std": 0.05103103816509247,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 98
},
{
"completion_length": 389.2500114440918,
"epoch": 0.05299076676033721,
"grad_norm": 1.0328341722488403,
"kl": 0.1228889636695385,
"learning_rate": 3.8099647649251984e-06,
"loss": 0.0049,
"reward": 1.067708358168602,
"reward_std": 0.422527939081192,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.27083333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.2916666716337204,
"rewards/xmlcount_reward_func": 0.421875,
"step": 99
},
{
"completion_length": 73.62500286102295,
"epoch": 0.05352602703064365,
"grad_norm": 0.7198861241340637,
"kl": 0.2295570969581604,
"learning_rate": 3.780107162176429e-06,
"loss": 0.0092,
"reward": 1.375,
"reward_std": 0.2885505259037018,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.2916666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 100
},
{
"completion_length": 247.083345413208,
"epoch": 0.054061287300950084,
"grad_norm": 0.8113954663276672,
"kl": 0.2137942397966981,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0086,
"reward": 1.0833750367164612,
"reward_std": 0.4043814614415169,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.29166667349636555,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.2916666716337204,
"rewards/xmlcount_reward_func": 0.4167083315551281,
"step": 101
},
{
"completion_length": 161.08334159851074,
"epoch": 0.05459654757125652,
"grad_norm": 0.9631187915802002,
"kl": 0.320892296731472,
"learning_rate": 3.7196491478468322e-06,
"loss": 0.0128,
"reward": 1.7552083432674408,
"reward_std": 0.20395417511463165,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.3541666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.484375,
"step": 102
},
{
"completion_length": 86.87500286102295,
"epoch": 0.05513180784156296,
"grad_norm": 0.5522407293319702,
"kl": 0.19866621680557728,
"learning_rate": 3.689060522675689e-06,
"loss": 0.0079,
"reward": 1.6974583566188812,
"reward_std": 0.43488648533821106,
"rewards/correctness_reward_func": 0.4166666865348816,
"rewards/int_reward_func": 0.3333333432674408,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333432674408,
"rewards/xmlcount_reward_func": 0.48912499845027924,
"step": 103
},
{
"completion_length": 116.37500381469727,
"epoch": 0.055667068111869396,
"grad_norm": 0.39440569281578064,
"kl": 0.2107317578047514,
"learning_rate": 3.658240087799655e-06,
"loss": 0.0084,
"reward": 1.4635416865348816,
"reward_std": 0.3243303596973419,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.4375000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333432674408,
"rewards/xmlcount_reward_func": 0.484375,
"step": 104
},
{
"completion_length": 115.41666984558105,
"epoch": 0.056202328382175835,
"grad_norm": 1.1680481433868408,
"kl": 0.4403987228870392,
"learning_rate": 3.627193851723577e-06,
"loss": 0.0176,
"reward": 1.2552083730697632,
"reward_std": 0.2559161148965359,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3125000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333432674408,
"rewards/xmlcount_reward_func": 0.484375,
"step": 105
},
{
"completion_length": 69.83333396911621,
"epoch": 0.05673758865248227,
"grad_norm": 0.4940861463546753,
"kl": 0.27071962505578995,
"learning_rate": 3.595927866972694e-06,
"loss": 0.0108,
"reward": 1.1666666865348816,
"reward_std": 0.06454972922801971,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.1666666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 106
},
{
"completion_length": 94.45833587646484,
"epoch": 0.05727284892278871,
"grad_norm": 1.4608356952667236,
"kl": 0.16904586926102638,
"learning_rate": 3.564448228912682e-06,
"loss": 0.0068,
"reward": 1.4895833432674408,
"reward_std": 0.4765794351696968,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.4166666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.416666679084301,
"rewards/xmlcount_reward_func": 0.4895833432674408,
"step": 107
},
{
"completion_length": 130.75000190734863,
"epoch": 0.05780810919309514,
"grad_norm": 1.187828779220581,
"kl": 0.2347713652998209,
"learning_rate": 3.532761074561355e-06,
"loss": 0.0094,
"reward": 1.0625000298023224,
"reward_std": 0.11558076366782188,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.2500000037252903,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3125,
"rewards/xmlcount_reward_func": 0.5,
"step": 108
},
{
"completion_length": 81.08333587646484,
"epoch": 0.05834336946340158,
"grad_norm": 0.4293256103992462,
"kl": 0.3195993173867464,
"learning_rate": 3.5008725813922383e-06,
"loss": 0.0128,
"reward": 1.1666666865348816,
"reward_std": 0.06454972922801971,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.1666666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 109
},
{
"completion_length": 153.29166984558105,
"epoch": 0.05887862973370801,
"grad_norm": 1.0056191682815552,
"kl": 0.20048995688557625,
"learning_rate": 3.4687889661302577e-06,
"loss": 0.008,
"reward": 1.4843750298023224,
"reward_std": 0.41315262764692307,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.37500000558793545,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.37500000558793545,
"rewards/xmlcount_reward_func": 0.484375,
"step": 110
},
{
"completion_length": 88.54167175292969,
"epoch": 0.05941389000401445,
"grad_norm": 1.9036917686462402,
"kl": 0.2402002513408661,
"learning_rate": 3.436516483539781e-06,
"loss": 0.0096,
"reward": 1.833333358168602,
"reward_std": 0.5183059275150299,
"rewards/correctness_reward_func": 0.5000000223517418,
"rewards/int_reward_func": 0.39583333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375000074505806,
"rewards/xmlcount_reward_func": 0.5,
"step": 111
},
{
"completion_length": 73.37500190734863,
"epoch": 0.059949150274320885,
"grad_norm": 1.473577857017517,
"kl": 0.31748900189995766,
"learning_rate": 3.4040614252052305e-06,
"loss": 0.0127,
"reward": 1.3645833730697632,
"reward_std": 0.3358423411846161,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.3541666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 0.4895833358168602,
"step": 112
},
{
"completion_length": 75.04166793823242,
"epoch": 0.060484410544627325,
"grad_norm": 0.9336056709289551,
"kl": 0.29590417072176933,
"learning_rate": 3.3714301183045382e-06,
"loss": 0.0118,
"reward": 1.270833358168602,
"reward_std": 0.1530931033194065,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333432674408,
"rewards/xmlcount_reward_func": 0.5,
"step": 113
},
{
"completion_length": 71.25000190734863,
"epoch": 0.061019670814933764,
"grad_norm": 1.1127337217330933,
"kl": 0.2616447024047375,
"learning_rate": 3.338628924375638e-06,
"loss": 0.0105,
"reward": 1.291666716337204,
"reward_std": 0.16661179810762405,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3125000149011612,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 114
},
{
"completion_length": 75.50000286102295,
"epoch": 0.0615549310852402,
"grad_norm": 0.7082913517951965,
"kl": 0.2341964803636074,
"learning_rate": 3.3056642380762783e-06,
"loss": 0.0094,
"reward": 1.4791666865348816,
"reward_std": 0.3881191611289978,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.3333333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 115
},
{
"completion_length": 206.91667556762695,
"epoch": 0.06209019135554664,
"grad_norm": 0.41269803047180176,
"kl": 0.18453531339764595,
"learning_rate": 3.272542485937369e-06,
"loss": 0.0074,
"reward": 1.5364583730697632,
"reward_std": 0.7113124281167984,
"rewards/correctness_reward_func": 0.3333333432674408,
"rewards/int_reward_func": 0.4375000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3125000037252903,
"rewards/xmlcount_reward_func": 0.453125,
"step": 116
},
{
"completion_length": 88.50000381469727,
"epoch": 0.06262545162585308,
"grad_norm": 0.8284731507301331,
"kl": 0.18175287544727325,
"learning_rate": 3.2392701251101172e-06,
"loss": 0.0073,
"reward": 1.3958333730697632,
"reward_std": 0.3092299550771713,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3541666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 117
},
{
"completion_length": 151.66666984558105,
"epoch": 0.0631607118961595,
"grad_norm": 1.4958237409591675,
"kl": 0.2391066513955593,
"learning_rate": 3.205853642107192e-06,
"loss": 0.0096,
"reward": 1.1510416865348816,
"reward_std": 0.5006890743970871,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.22916666977107525,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.2708333432674408,
"rewards/xmlcount_reward_func": 0.484375,
"step": 118
},
{
"completion_length": 123.37500381469727,
"epoch": 0.06369597216646594,
"grad_norm": 0.7593151926994324,
"kl": 0.23146136105060577,
"learning_rate": 3.1722995515381644e-06,
"loss": 0.0093,
"reward": 1.4270833432674408,
"reward_std": 0.48038505017757416,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.3750000037252903,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.46875,
"step": 119
},
{
"completion_length": 123.75000762939453,
"epoch": 0.06423123243677238,
"grad_norm": 0.3002820909023285,
"kl": 0.18683998472988605,
"learning_rate": 3.1386143948394764e-06,
"loss": 0.0075,
"reward": 1.3802083730697632,
"reward_std": 0.2934284619987011,
"rewards/correctness_reward_func": 0.4166666865348816,
"rewards/int_reward_func": 0.1041666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.375,
"rewards/xmlcount_reward_func": 0.484375,
"step": 120
},
{
"completion_length": 71.62500190734863,
"epoch": 0.06476649270707882,
"grad_norm": 0.894112229347229,
"kl": 0.2145114541053772,
"learning_rate": 3.1048047389991693e-06,
"loss": 0.0086,
"reward": 1.2291666865348816,
"reward_std": 0.05103103443980217,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 121
},
{
"completion_length": 77.50000095367432,
"epoch": 0.06530175297738525,
"grad_norm": 0.9729853272438049,
"kl": 0.25685518980026245,
"learning_rate": 3.0708771752766397e-06,
"loss": 0.0103,
"reward": 1.645833358168602,
"reward_std": 0.25515517219901085,
"rewards/correctness_reward_func": 0.4166666865348816,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 122
},
{
"completion_length": 115.25000381469727,
"epoch": 0.06583701324769169,
"grad_norm": 0.60560142993927,
"kl": 0.26780444383621216,
"learning_rate": 3.0368383179176584e-06,
"loss": 0.0107,
"reward": 1.3333333432674408,
"reward_std": 0.06454972922801971,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333358168602,
"rewards/xmlcount_reward_func": 0.5,
"step": 123
},
{
"completion_length": 65.25000190734863,
"epoch": 0.06637227351799813,
"grad_norm": 0.47688770294189453,
"kl": 0.2243332415819168,
"learning_rate": 3.002694802864912e-06,
"loss": 0.009,
"reward": 1.2291666865348816,
"reward_std": 0.05103103816509247,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 124
},
{
"completion_length": 349.833345413208,
"epoch": 0.06690753378830457,
"grad_norm": 0.6750498414039612,
"kl": 0.27263053273782134,
"learning_rate": 2.9684532864643123e-06,
"loss": 0.0109,
"reward": 1.5260833650827408,
"reward_std": 0.4318152070045471,
"rewards/correctness_reward_func": 0.4166666865348816,
"rewards/int_reward_func": 0.3333333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3541666716337204,
"rewards/xmlcount_reward_func": 0.421916663646698,
"step": 125
},
{
"completion_length": 186.62500762939453,
"epoch": 0.067442794058611,
"grad_norm": 0.8187605142593384,
"kl": 0.2676307410001755,
"learning_rate": 2.9341204441673267e-06,
"loss": 0.0107,
"reward": 1.55266672372818,
"reward_std": 0.4699760675430298,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.4166666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375000074505806,
"rewards/xmlcount_reward_func": 0.44849999994039536,
"step": 126
},
{
"completion_length": 72.50000190734863,
"epoch": 0.06797805432891743,
"grad_norm": 0.06586437672376633,
"kl": 0.25656602531671524,
"learning_rate": 2.8997029692295875e-06,
"loss": 0.0103,
"reward": 1.375,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 127
},
{
"completion_length": 67.66666889190674,
"epoch": 0.06851331459922387,
"grad_norm": 0.13435645401477814,
"kl": 0.23636912554502487,
"learning_rate": 2.8652075714060296e-06,
"loss": 0.0095,
"reward": 1.25,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 128
},
{
"completion_length": 71.12500381469727,
"epoch": 0.06904857486953031,
"grad_norm": 0.6717380285263062,
"kl": 0.3070458807051182,
"learning_rate": 2.8306409756428067e-06,
"loss": 0.0123,
"reward": 1.4375000298023224,
"reward_std": 0.11558076366782188,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.4583333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 129
},
{
"completion_length": 73.16666889190674,
"epoch": 0.06958383513983675,
"grad_norm": 0.9400418996810913,
"kl": 0.22653049230575562,
"learning_rate": 2.7960099207662535e-06,
"loss": 0.0091,
"reward": 2.0000000596046448,
"reward_std": 0.556085180491209,
"rewards/correctness_reward_func": 0.6666666865348816,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333432674408,
"rewards/xmlcount_reward_func": 0.5,
"step": 130
},
{
"completion_length": 71.66666889190674,
"epoch": 0.07011909541014318,
"grad_norm": 0.6696528792381287,
"kl": 0.26699281856417656,
"learning_rate": 2.761321158169134e-06,
"loss": 0.0107,
"reward": 1.3958333730697632,
"reward_std": 0.25515518337488174,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.4583333432674408,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375000149011612,
"rewards/xmlcount_reward_func": 0.5,
"step": 131
},
{
"completion_length": 98.91666984558105,
"epoch": 0.07065435568044962,
"grad_norm": 0.8539110422134399,
"kl": 0.23305394127964973,
"learning_rate": 2.726581450494451e-06,
"loss": 0.0093,
"reward": 1.1562500298023224,
"reward_std": 0.22562336921691895,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.2916666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3750000037252903,
"rewards/xmlcount_reward_func": 0.4895833358168602,
"step": 132
},
{
"completion_length": 76.66666984558105,
"epoch": 0.07118961595075605,
"grad_norm": 0.2167089283466339,
"kl": 0.20909808576107025,
"learning_rate": 2.6917975703170466e-06,
"loss": 0.0084,
"reward": 1.25,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 133
},
{
"completion_length": 69.50000190734863,
"epoch": 0.0717248762210625,
"grad_norm": 1.07498037815094,
"kl": 0.2273651361465454,
"learning_rate": 2.6569762988232838e-06,
"loss": 0.0091,
"reward": 1.2291666865348816,
"reward_std": 0.11558076366782188,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.22916666977107525,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 134
},
{
"completion_length": 78.87500190734863,
"epoch": 0.07226013649136893,
"grad_norm": 1.082901954650879,
"kl": 0.21143031865358353,
"learning_rate": 2.6221244244890336e-06,
"loss": 0.0085,
"reward": 1.2691666781902313,
"reward_std": 0.16395629942417145,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.4775000065565109,
"step": 135
},
{
"completion_length": 84.79166793823242,
"epoch": 0.07279539676167536,
"grad_norm": 0.5148155093193054,
"kl": 0.19925828650593758,
"learning_rate": 2.587248741756253e-06,
"loss": 0.008,
"reward": 1.375,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 136
},
{
"completion_length": 102.62500190734863,
"epoch": 0.0733306570319818,
"grad_norm": 0.6643544435501099,
"kl": 0.1562279723584652,
"learning_rate": 2.5523560497083927e-06,
"loss": 0.0062,
"reward": 1.248750001192093,
"reward_std": 0.2717357352375984,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3333333432674408,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375000149011612,
"rewards/xmlcount_reward_func": 0.4779166728258133,
"step": 137
},
{
"completion_length": 67.0416669845581,
"epoch": 0.07386591730228824,
"grad_norm": 0.4721544086933136,
"kl": 0.19591450318694115,
"learning_rate": 2.517453150744904e-06,
"loss": 0.0078,
"reward": 1.3541666865348816,
"reward_std": 0.05103103816509247,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 138
},
{
"completion_length": 79.37500190734863,
"epoch": 0.07440117757259468,
"grad_norm": 0.4948488771915436,
"kl": 0.20949136465787888,
"learning_rate": 2.482546849255096e-06,
"loss": 0.0084,
"reward": 1.5416666865348816,
"reward_std": 0.25819891691207886,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 139
},
{
"completion_length": 64.29166889190674,
"epoch": 0.0749364378429011,
"grad_norm": 0.8793404698371887,
"kl": 0.22966529056429863,
"learning_rate": 2.447643950291608e-06,
"loss": 0.0092,
"reward": 1.3541666865348816,
"reward_std": 0.25515519082546234,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.27083333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 140
},
{
"completion_length": 139.62500190734863,
"epoch": 0.07547169811320754,
"grad_norm": 0.5459631681442261,
"kl": 0.19374394416809082,
"learning_rate": 2.4127512582437486e-06,
"loss": 0.0077,
"reward": 1.177083358168602,
"reward_std": 0.18890930339694023,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.2916666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666679084301,
"rewards/xmlcount_reward_func": 0.46875,
"step": 141
},
{
"completion_length": 153.16666984558105,
"epoch": 0.07600695838351398,
"grad_norm": 0.9452311396598816,
"kl": 0.1499454267323017,
"learning_rate": 2.377875575510967e-06,
"loss": 0.006,
"reward": 1.7239583730697632,
"reward_std": 0.8880488127470016,
"rewards/correctness_reward_func": 0.666666679084301,
"rewards/int_reward_func": 0.3958333432674408,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.4114583358168602,
"step": 142
},
{
"completion_length": 61.29166793823242,
"epoch": 0.07654221865382042,
"grad_norm": 0.09438279271125793,
"kl": 0.3462410867214203,
"learning_rate": 2.3430237011767166e-06,
"loss": 0.0138,
"reward": 1.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 143
},
{
"completion_length": 88.37500286102295,
"epoch": 0.07707747892412686,
"grad_norm": 1.7993136644363403,
"kl": 0.32219041138887405,
"learning_rate": 2.3082024296829538e-06,
"loss": 0.0129,
"reward": 1.4218750596046448,
"reward_std": 0.4565740302205086,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.37500000558793545,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333358168602,
"rewards/xmlcount_reward_func": 0.484375,
"step": 144
},
{
"completion_length": 72.62500381469727,
"epoch": 0.07761273919443329,
"grad_norm": 0.09042877703905106,
"kl": 0.20896168053150177,
"learning_rate": 2.2734185495055503e-06,
"loss": 0.0084,
"reward": 1.125,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 145
},
{
"completion_length": 163.75000190734863,
"epoch": 0.07814799946473973,
"grad_norm": 1.1228388547897339,
"kl": 0.21238887682557106,
"learning_rate": 2.238678841830867e-06,
"loss": 0.0085,
"reward": 1.1927083730697632,
"reward_std": 0.2637527585029602,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3333333432674408,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3750000074505806,
"rewards/xmlcount_reward_func": 0.484375,
"step": 146
},
{
"completion_length": 71.95833683013916,
"epoch": 0.07868325973504617,
"grad_norm": 0.661372721195221,
"kl": 0.3126152493059635,
"learning_rate": 2.2039900792337477e-06,
"loss": 0.0125,
"reward": 1.4375,
"reward_std": 0.22008520364761353,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.39583333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333358168602,
"rewards/xmlcount_reward_func": 0.5,
"step": 147
},
{
"completion_length": 72.33333492279053,
"epoch": 0.07921852000535261,
"grad_norm": 0.6126328706741333,
"kl": 0.18934645876288414,
"learning_rate": 2.1693590243571937e-06,
"loss": 0.0076,
"reward": 1.8333333432674408,
"reward_std": 0.20412413775920868,
"rewards/correctness_reward_func": 0.5833333358168602,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 148
},
{
"completion_length": 103.37500381469727,
"epoch": 0.07975378027565903,
"grad_norm": 0.8722951412200928,
"kl": 0.14480971172451973,
"learning_rate": 2.134792428593971e-06,
"loss": 0.0058,
"reward": 1.895833432674408,
"reward_std": 0.5449064522981644,
"rewards/correctness_reward_func": 0.5000000149011612,
"rewards/int_reward_func": 0.4791666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 149
},
{
"completion_length": 69.54166984558105,
"epoch": 0.08028904054596547,
"grad_norm": 0.7660404443740845,
"kl": 0.20250581949949265,
"learning_rate": 2.1002970307704134e-06,
"loss": 0.0081,
"reward": 1.3958333432674408,
"reward_std": 0.35721728205680847,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 150
},
{
"completion_length": 180.7083396911621,
"epoch": 0.08082430081627191,
"grad_norm": 0.5615190267562866,
"kl": 0.23339027352631092,
"learning_rate": 2.0658795558326745e-06,
"loss": 0.0093,
"reward": 1.5729166865348816,
"reward_std": 0.31237732619047165,
"rewards/correctness_reward_func": 0.4166666865348816,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 0.46875,
"step": 151
},
{
"completion_length": 63.83333396911621,
"epoch": 0.08135956108657835,
"grad_norm": 0.1392926722764969,
"kl": 0.26523152738809586,
"learning_rate": 2.031546713535688e-06,
"loss": 0.0106,
"reward": 1.375,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 152
},
{
"completion_length": 81.08333778381348,
"epoch": 0.08189482135688479,
"grad_norm": 1.1122794151306152,
"kl": 0.25071796402335167,
"learning_rate": 1.997305197135089e-06,
"loss": 0.01,
"reward": 1.5200416892766953,
"reward_std": 0.3365423232316971,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.1875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.354166679084301,
"rewards/xmlcount_reward_func": 0.47837500274181366,
"step": 153
},
{
"completion_length": 93.37500381469727,
"epoch": 0.08243008162719122,
"grad_norm": 1.3372734785079956,
"kl": 0.19058941677212715,
"learning_rate": 1.963161682082342e-06,
"loss": 0.0076,
"reward": 1.6250000298023224,
"reward_std": 0.47279806435108185,
"rewards/correctness_reward_func": 0.5000000223517418,
"rewards/int_reward_func": 0.22916666977107525,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.39583333395421505,
"rewards/xmlcount_reward_func": 0.5,
"step": 154
},
{
"completion_length": 104.83333396911621,
"epoch": 0.08296534189749766,
"grad_norm": 2.2292211055755615,
"kl": 0.16792716644704342,
"learning_rate": 1.9291228247233607e-06,
"loss": 0.0067,
"reward": 2.0000000596046448,
"reward_std": 0.4289814233779907,
"rewards/correctness_reward_func": 0.583333358168602,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666679084301,
"rewards/xmlcount_reward_func": 0.5,
"step": 155
},
{
"completion_length": 90.87500476837158,
"epoch": 0.0835006021678041,
"grad_norm": 0.4122345745563507,
"kl": 0.18274019937962294,
"learning_rate": 1.895195261000831e-06,
"loss": 0.0073,
"reward": 1.2708333432674408,
"reward_std": 0.05103103443980217,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.27083333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 156
},
{
"completion_length": 172.45833587646484,
"epoch": 0.08403586243811054,
"grad_norm": 0.8472205400466919,
"kl": 0.2094765491783619,
"learning_rate": 1.8613856051605242e-06,
"loss": 0.0084,
"reward": 1.2343750298023224,
"reward_std": 0.2048850804567337,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3541666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333395421505,
"rewards/xmlcount_reward_func": 0.484375,
"step": 157
},
{
"completion_length": 127.00000762939453,
"epoch": 0.08457112270841696,
"grad_norm": 0.6804733276367188,
"kl": 0.20304612442851067,
"learning_rate": 1.827700448461836e-06,
"loss": 0.0081,
"reward": 1.192708358168602,
"reward_std": 0.17030073329806328,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25000000558793545,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333358168602,
"rewards/xmlcount_reward_func": 0.484375,
"step": 158
},
{
"completion_length": 85.29166793823242,
"epoch": 0.0851063829787234,
"grad_norm": 0.7838725447654724,
"kl": 0.29594049230217934,
"learning_rate": 1.7941463578928088e-06,
"loss": 0.0118,
"reward": 1.2708333432674408,
"reward_std": 0.16913224011659622,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3541666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375000074505806,
"rewards/xmlcount_reward_func": 0.4791666716337204,
"step": 159
},
{
"completion_length": 126.00000381469727,
"epoch": 0.08564164324902984,
"grad_norm": 1.384969711303711,
"kl": 0.16587615385651588,
"learning_rate": 1.7607298748898844e-06,
"loss": 0.0066,
"reward": 1.4843750149011612,
"reward_std": 0.5336930006742477,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.2916666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333432674408,
"rewards/xmlcount_reward_func": 0.484375,
"step": 160
},
{
"completion_length": 72.12500286102295,
"epoch": 0.08617690351933628,
"grad_norm": 0.0433061420917511,
"kl": 0.1850288063287735,
"learning_rate": 1.7274575140626318e-06,
"loss": 0.0074,
"reward": 1.125,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 161
},
{
"completion_length": 174.58334159851074,
"epoch": 0.08671216378964271,
"grad_norm": 0.8271235823631287,
"kl": 0.18862449377775192,
"learning_rate": 1.6943357619237227e-06,
"loss": 0.0075,
"reward": 1.2083333432674408,
"reward_std": 0.30994437262415886,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.3333333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3333333432674408,
"rewards/xmlcount_reward_func": 0.4583333358168602,
"step": 162
},
{
"completion_length": 78.20833587646484,
"epoch": 0.08724742405994915,
"grad_norm": 0.908470869064331,
"kl": 0.1706334725022316,
"learning_rate": 1.661371075624363e-06,
"loss": 0.0068,
"reward": 1.2708333432674408,
"reward_std": 0.33958156406879425,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.27083333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 163
},
{
"completion_length": 137.91666984558105,
"epoch": 0.08778268433025559,
"grad_norm": 0.7333254814147949,
"kl": 0.1247784998267889,
"learning_rate": 1.6285698816954626e-06,
"loss": 0.005,
"reward": 1.5208333432674408,
"reward_std": 0.767971470952034,
"rewards/correctness_reward_func": 0.3333333432674408,
"rewards/int_reward_func": 0.29166667349636555,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.39583333395421505,
"rewards/xmlcount_reward_func": 0.5,
"step": 164
},
{
"completion_length": 78.08333683013916,
"epoch": 0.08831794460056203,
"grad_norm": 0.8487056493759155,
"kl": 0.22962494008243084,
"learning_rate": 1.5959385747947697e-06,
"loss": 0.0092,
"reward": 1.2916666865348816,
"reward_std": 0.14360667020082474,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.31250000186264515,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 165
},
{
"completion_length": 200.79166793823242,
"epoch": 0.08885320487086847,
"grad_norm": 1.4497365951538086,
"kl": 0.1650528460741043,
"learning_rate": 1.56348351646022e-06,
"loss": 0.0066,
"reward": 1.4895834028720856,
"reward_std": 0.5747665874660015,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.4375000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.46875,
"step": 166
},
{
"completion_length": 176.12500762939453,
"epoch": 0.08938846514117489,
"grad_norm": 0.4245186746120453,
"kl": 0.15275901928544044,
"learning_rate": 1.5312110338697427e-06,
"loss": 0.0061,
"reward": 1.1822916865348816,
"reward_std": 0.3088150769472122,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3125000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.453125,
"step": 167
},
{
"completion_length": 312.9583396911621,
"epoch": 0.08992372541148133,
"grad_norm": 0.7641220092773438,
"kl": 0.14276206120848656,
"learning_rate": 1.4991274186077632e-06,
"loss": 0.0057,
"reward": 1.166666716337204,
"reward_std": 0.49721667170524597,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.22916666977107525,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666679084301,
"rewards/xmlcount_reward_func": 0.4375,
"step": 168
},
{
"completion_length": 101.45833778381348,
"epoch": 0.09045898568178777,
"grad_norm": 0.7765936255455017,
"kl": 0.1949683390557766,
"learning_rate": 1.467238925438646e-06,
"loss": 0.0078,
"reward": 1.2916666865348816,
"reward_std": 0.28610818088054657,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.3125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333395421505,
"rewards/xmlcount_reward_func": 0.5,
"step": 169
},
{
"completion_length": 172.45833778381348,
"epoch": 0.09099424595209421,
"grad_norm": 0.41969409584999084,
"kl": 0.13137296214699745,
"learning_rate": 1.4355517710873184e-06,
"loss": 0.0053,
"reward": 2.114583343267441,
"reward_std": 0.5827288627624512,
"rewards/correctness_reward_func": 0.8333333358168602,
"rewards/int_reward_func": 0.4583333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3541666716337204,
"rewards/xmlcount_reward_func": 0.46875,
"step": 170
},
{
"completion_length": 79.95833587646484,
"epoch": 0.09152950622240064,
"grad_norm": 0.16617827117443085,
"kl": 0.16961714625358582,
"learning_rate": 1.4040721330273063e-06,
"loss": 0.0068,
"reward": 1.25,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 171
},
{
"completion_length": 80.87500190734863,
"epoch": 0.09206476649270708,
"grad_norm": 0.36631259322166443,
"kl": 0.1460169106721878,
"learning_rate": 1.3728061482764238e-06,
"loss": 0.0058,
"reward": 1.3333333432674408,
"reward_std": 0.06454972922801971,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3333333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 172
},
{
"completion_length": 97.12500190734863,
"epoch": 0.09260002676301352,
"grad_norm": 1.1213421821594238,
"kl": 0.14688214287161827,
"learning_rate": 1.3417599122003464e-06,
"loss": 0.0059,
"reward": 1.2291666865348816,
"reward_std": 0.05103103816509247,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.2291666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 173
},
{
"completion_length": 85.58333587646484,
"epoch": 0.09313528703331996,
"grad_norm": 0.7063998579978943,
"kl": 0.17591003328561783,
"learning_rate": 1.3109394773243117e-06,
"loss": 0.007,
"reward": 1.3958333730697632,
"reward_std": 0.11558076739311218,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3958333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 174
},
{
"completion_length": 89.04166793823242,
"epoch": 0.0936705473036264,
"grad_norm": 0.7872856855392456,
"kl": 0.17240377515554428,
"learning_rate": 1.280350852153168e-06,
"loss": 0.0069,
"reward": 1.2500000447034836,
"reward_std": 0.2728445753455162,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.2083333432674408,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333432674408,
"rewards/xmlcount_reward_func": 0.5,
"step": 175
},
{
"completion_length": 252.00000762939453,
"epoch": 0.09420580757393282,
"grad_norm": 1.0295542478561401,
"kl": 0.20453453436493874,
"learning_rate": 1.2500000000000007e-06,
"loss": 0.0082,
"reward": 1.3854167014360428,
"reward_std": 0.5866826139390469,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.3958333432674408,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.35416666977107525,
"rewards/xmlcount_reward_func": 0.46875,
"step": 176
},
{
"completion_length": 147.75000381469727,
"epoch": 0.09474106784423926,
"grad_norm": 0.834882915019989,
"kl": 0.134497981518507,
"learning_rate": 1.2198928378235717e-06,
"loss": 0.0054,
"reward": 1.4427083730697632,
"reward_std": 0.5198761932551861,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.37500000558793545,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.484375,
"step": 177
},
{
"completion_length": 100.08333778381348,
"epoch": 0.0952763281145457,
"grad_norm": 1.221563458442688,
"kl": 0.16201673820614815,
"learning_rate": 1.1900352350748026e-06,
"loss": 0.0065,
"reward": 1.338541716337204,
"reward_std": 0.2559161148965359,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3958333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333432674408,
"rewards/xmlcount_reward_func": 0.484375,
"step": 178
},
{
"completion_length": 127.83333969116211,
"epoch": 0.09581158838485214,
"grad_norm": 0.886989176273346,
"kl": 0.15441275481134653,
"learning_rate": 1.160433012552508e-06,
"loss": 0.0062,
"reward": 0.9375000149011612,
"reward_std": 0.18744874745607376,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3750000037252903,
"rewards/xmlcount_reward_func": 0.5,
"step": 179
},
{
"completion_length": 86.87500190734863,
"epoch": 0.09634684865515857,
"grad_norm": 1.1908172369003296,
"kl": 0.15353485196828842,
"learning_rate": 1.1310919412686248e-06,
"loss": 0.0061,
"reward": 1.0833333432674408,
"reward_std": 0.10206207260489464,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333432674408,
"rewards/xmlcount_reward_func": 0.5,
"step": 180
},
{
"completion_length": 89.58333587646484,
"epoch": 0.096882108925465,
"grad_norm": 1.44786536693573,
"kl": 0.1816324070096016,
"learning_rate": 1.1020177413231334e-06,
"loss": 0.0073,
"reward": 1.510416716337204,
"reward_std": 0.472514558583498,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.3750000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333358168602,
"rewards/xmlcount_reward_func": 0.4895833358168602,
"step": 181
},
{
"completion_length": 82.58333778381348,
"epoch": 0.09741736919577144,
"grad_norm": 0.13121654093265533,
"kl": 0.18894518539309502,
"learning_rate": 1.073216080788921e-06,
"loss": 0.0076,
"reward": 1.25,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 182
},
{
"completion_length": 221.0000123977661,
"epoch": 0.09795262946607788,
"grad_norm": 1.4435704946517944,
"kl": 0.18115888815373182,
"learning_rate": 1.0446925746067768e-06,
"loss": 0.0072,
"reward": 1.1875,
"reward_std": 0.28912585973739624,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.2916666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.375,
"rewards/xmlcount_reward_func": 0.4375,
"step": 183
},
{
"completion_length": 125.9583387374878,
"epoch": 0.09848788973638432,
"grad_norm": 1.2450430393218994,
"kl": 0.14993033185601234,
"learning_rate": 1.0164527834907468e-06,
"loss": 0.006,
"reward": 1.6093750596046448,
"reward_std": 0.7003048211336136,
"rewards/correctness_reward_func": 0.583333358168602,
"rewards/int_reward_func": 0.1875000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3541666716337204,
"rewards/xmlcount_reward_func": 0.484375,
"step": 184
},
{
"completion_length": 250.50000381469727,
"epoch": 0.09902315000669075,
"grad_norm": 0.6676502823829651,
"kl": 0.14374011009931564,
"learning_rate": 9.88502212844063e-07,
"loss": 0.0058,
"reward": 1.6502083837985992,
"reward_std": 0.4829741967841983,
"rewards/correctness_reward_func": 0.6666666716337204,
"rewards/int_reward_func": 0.2916666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.2708333395421505,
"rewards/xmlcount_reward_func": 0.42104167491197586,
"step": 185
},
{
"completion_length": 95.41666984558105,
"epoch": 0.09955841027699719,
"grad_norm": 0.14050611853599548,
"kl": 0.19362322241067886,
"learning_rate": 9.608463116858544e-07,
"loss": 0.0077,
"reward": 1.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.375,
"rewards/xmlcount_reward_func": 0.5,
"step": 186
},
{
"completion_length": 69.25000190734863,
"epoch": 0.10009367054730363,
"grad_norm": 0.7105045318603516,
"kl": 0.22476506605744362,
"learning_rate": 9.334904715888496e-07,
"loss": 0.009,
"reward": 1.2291666865348816,
"reward_std": 0.05103103443980217,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.2291666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 187
},
{
"completion_length": 81.08333778381348,
"epoch": 0.10062893081761007,
"grad_norm": 1.2536767721176147,
"kl": 0.20191873610019684,
"learning_rate": 9.064400256282757e-07,
"loss": 0.0081,
"reward": 1.0000000298023224,
"reward_std": 0.20479072630405426,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.10416666977107525,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.4791666716337204,
"step": 188
},
{
"completion_length": 79.95833587646484,
"epoch": 0.1011641910879165,
"grad_norm": 0.1358516663312912,
"kl": 0.2150093950331211,
"learning_rate": 8.797002473421729e-07,
"loss": 0.0086,
"reward": 1.375,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 189
},
{
"completion_length": 255.83334350585938,
"epoch": 0.10169945135822293,
"grad_norm": 0.8230968713760376,
"kl": 0.1948665827512741,
"learning_rate": 8.532763497032987e-07,
"loss": 0.0078,
"reward": 1.1041666865348816,
"reward_std": 0.3931647092103958,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.2500000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.4375,
"step": 190
},
{
"completion_length": 72.45833492279053,
"epoch": 0.10223471162852937,
"grad_norm": 0.6617278456687927,
"kl": 0.17371252551674843,
"learning_rate": 8.271734841028553e-07,
"loss": 0.0069,
"reward": 1.4166666865348816,
"reward_std": 0.4518480896949768,
"rewards/correctness_reward_func": 0.2500000074505806,
"rewards/int_reward_func": 0.1875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 191
},
{
"completion_length": 92.29166889190674,
"epoch": 0.10276997189883581,
"grad_norm": 0.7337960600852966,
"kl": 0.18250929936766624,
"learning_rate": 8.013967393462094e-07,
"loss": 0.0073,
"reward": 1.4375000596046448,
"reward_std": 0.4927079305052757,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.2708333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 192
},
{
"completion_length": 93.79166984558105,
"epoch": 0.10330523216914224,
"grad_norm": 1.3416187763214111,
"kl": 0.19713782332837582,
"learning_rate": 7.759511406608255e-07,
"loss": 0.0079,
"reward": 1.5156250596046448,
"reward_std": 0.5118480771780014,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.37500000558793545,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333432674408,
"rewards/xmlcount_reward_func": 0.4947916716337204,
"step": 193
},
{
"completion_length": 96.08333969116211,
"epoch": 0.10384049243944868,
"grad_norm": 0.9250803589820862,
"kl": 0.21085454896092415,
"learning_rate": 7.508416487165862e-07,
"loss": 0.0084,
"reward": 1.8125000596046448,
"reward_std": 0.31970491632819176,
"rewards/correctness_reward_func": 0.4166666865348816,
"rewards/int_reward_func": 0.4166666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 194
},
{
"completion_length": 74.00000190734863,
"epoch": 0.10437575270975512,
"grad_norm": 1.441701889038086,
"kl": 0.18313675373792648,
"learning_rate": 7.260731586586983e-07,
"loss": 0.0073,
"reward": 1.7500000298023224,
"reward_std": 0.3624359965324402,
"rewards/correctness_reward_func": 0.3333333432674408,
"rewards/int_reward_func": 0.4583333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333432674408,
"rewards/xmlcount_reward_func": 0.5,
"step": 195
},
{
"completion_length": 116.54167175292969,
"epoch": 0.10491101298006156,
"grad_norm": 1.0711389780044556,
"kl": 0.13384228572249413,
"learning_rate": 7.016504991533727e-07,
"loss": 0.0054,
"reward": 1.1666667014360428,
"reward_std": 0.16661180183291435,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.27083333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333395421505,
"rewards/xmlcount_reward_func": 0.5,
"step": 196
},
{
"completion_length": 78.83333587646484,
"epoch": 0.105446273250368,
"grad_norm": 1.2510522603988647,
"kl": 0.18244327045977116,
"learning_rate": 6.775784314464717e-07,
"loss": 0.0073,
"reward": 1.0781250298023224,
"reward_std": 0.17936956882476807,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.10416666977107525,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.4947916716337204,
"step": 197
},
{
"completion_length": 201.83333492279053,
"epoch": 0.10598153352067442,
"grad_norm": 1.4353007078170776,
"kl": 0.17845631763339043,
"learning_rate": 6.538616484352902e-07,
"loss": 0.0071,
"reward": 1.4947916865348816,
"reward_std": 0.45137757435441017,
"rewards/correctness_reward_func": 0.3333333432674408,
"rewards/int_reward_func": 0.33333334140479565,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.375,
"rewards/xmlcount_reward_func": 0.453125,
"step": 198
},
{
"completion_length": 315.6666736602783,
"epoch": 0.10651679379098086,
"grad_norm": 0.7709922790527344,
"kl": 0.15007262770086527,
"learning_rate": 6.305047737536707e-07,
"loss": 0.006,
"reward": 1.3333333730697632,
"reward_std": 0.43266693875193596,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.4166666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333395421505,
"rewards/xmlcount_reward_func": 0.4375,
"step": 199
},
{
"completion_length": 226.25001335144043,
"epoch": 0.1070520540612873,
"grad_norm": 0.4058845043182373,
"kl": 0.24160834029316902,
"learning_rate": 6.075123608706093e-07,
"loss": 0.0097,
"reward": 1.6354166865348816,
"reward_std": 0.6109069883823395,
"rewards/correctness_reward_func": 0.3333333358168602,
"rewards/int_reward_func": 0.3958333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 0.46875,
"step": 200
},
{
"completion_length": 78.66666793823242,
"epoch": 0.10758731433159374,
"grad_norm": 0.5436657071113586,
"kl": 0.16075460240244865,
"learning_rate": 5.848888922025553e-07,
"loss": 0.0064,
"reward": 1.4375,
"reward_std": 0.22008520364761353,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.3541666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 201
},
{
"completion_length": 82.58333396911621,
"epoch": 0.10812257460190017,
"grad_norm": 0.7532062530517578,
"kl": 0.1826519127935171,
"learning_rate": 5.626387782395512e-07,
"loss": 0.0073,
"reward": 1.7916666865348816,
"reward_std": 0.20412413775920868,
"rewards/correctness_reward_func": 0.4166666865348816,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 202
},
{
"completion_length": 79.16666984558105,
"epoch": 0.10865783487220661,
"grad_norm": 1.078783631324768,
"kl": 0.1677638739347458,
"learning_rate": 5.407663566854008e-07,
"loss": 0.0067,
"reward": 1.4375000298023224,
"reward_std": 0.31970491632819176,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.35416666977107525,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 203
},
{
"completion_length": 92.33333396911621,
"epoch": 0.10919309514251305,
"grad_norm": 0.8154336214065552,
"kl": 0.18606754019856453,
"learning_rate": 5.192758916120236e-07,
"loss": 0.0074,
"reward": 1.9375000298023224,
"reward_std": 0.44672293961048126,
"rewards/correctness_reward_func": 0.5000000223517418,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 204
},
{
"completion_length": 215.95834159851074,
"epoch": 0.10972835541281949,
"grad_norm": 0.8592817783355713,
"kl": 0.15069226268678904,
"learning_rate": 4.981715726281666e-07,
"loss": 0.006,
"reward": 1.2552083730697632,
"reward_std": 0.21166006475687027,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3958333432674408,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.39583333395421505,
"rewards/xmlcount_reward_func": 0.4635416716337204,
"step": 205
},
{
"completion_length": 160.62500381469727,
"epoch": 0.11026361568312593,
"grad_norm": 1.0854992866516113,
"kl": 0.1672863345593214,
"learning_rate": 4.774575140626317e-07,
"loss": 0.0067,
"reward": 1.3802083730697632,
"reward_std": 0.4717924892902374,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.4166666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333358168602,
"rewards/xmlcount_reward_func": 0.484375,
"step": 206
},
{
"completion_length": 85.62500190734863,
"epoch": 0.11079887595343235,
"grad_norm": 2.937671184539795,
"kl": 0.22150231339037418,
"learning_rate": 4.5713775416217884e-07,
"loss": 0.0089,
"reward": 1.1458333730697632,
"reward_std": 0.3955717794597149,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.18750000186264515,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3750000074505806,
"rewards/xmlcount_reward_func": 0.5,
"step": 207
},
{
"completion_length": 86.54166889190674,
"epoch": 0.11133413622373879,
"grad_norm": 0.791983962059021,
"kl": 0.2634577229619026,
"learning_rate": 4.372162543042624e-07,
"loss": 0.0105,
"reward": 1.3541667014360428,
"reward_std": 0.31970490142703056,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.3541666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666679084301,
"rewards/xmlcount_reward_func": 0.5,
"step": 208
},
{
"completion_length": 103.04166984558105,
"epoch": 0.11186939649404523,
"grad_norm": 0.7941485643386841,
"kl": 0.15498985722661018,
"learning_rate": 4.1769689822475147e-07,
"loss": 0.0062,
"reward": 1.2343750298023224,
"reward_std": 0.3260645717382431,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25000000186264515,
"rewards/xmlcount_reward_func": 0.4635416716337204,
"step": 209
},
{
"completion_length": 126.79166984558105,
"epoch": 0.11240465676435167,
"grad_norm": 1.0149474143981934,
"kl": 0.12467027455568314,
"learning_rate": 3.9858349126078945e-07,
"loss": 0.005,
"reward": 1.7271667420864105,
"reward_std": 0.5239234380424023,
"rewards/correctness_reward_func": 0.4166666865348816,
"rewards/int_reward_func": 0.4166666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.416666679084301,
"rewards/xmlcount_reward_func": 0.47716666758060455,
"step": 210
},
{
"completion_length": 85.79166984558105,
"epoch": 0.1129399170346581,
"grad_norm": 0.8396437764167786,
"kl": 0.18657264113426208,
"learning_rate": 3.798797596089351e-07,
"loss": 0.0075,
"reward": 1.5000000298023224,
"reward_std": 0.32274864614009857,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.3333333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 211
},
{
"completion_length": 108.00000381469727,
"epoch": 0.11347517730496454,
"grad_norm": 0.934079110622406,
"kl": 0.1696683205664158,
"learning_rate": 3.615893495987335e-07,
"loss": 0.0068,
"reward": 1.8125000298023224,
"reward_std": 0.6319277845323086,
"rewards/correctness_reward_func": 0.5000000149011612,
"rewards/int_reward_func": 0.3958333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.416666679084301,
"rewards/xmlcount_reward_func": 0.5,
"step": 212
},
{
"completion_length": 193.0416669845581,
"epoch": 0.11401043757527098,
"grad_norm": 0.6677061319351196,
"kl": 0.1520245149731636,
"learning_rate": 3.4371582698185636e-07,
"loss": 0.0061,
"reward": 1.713541716337204,
"reward_std": 0.5496542304754257,
"rewards/correctness_reward_func": 0.583333358168602,
"rewards/int_reward_func": 0.3333333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3333333395421505,
"rewards/xmlcount_reward_func": 0.4635416716337204,
"step": 213
},
{
"completion_length": 86.66666984558105,
"epoch": 0.11454569784557742,
"grad_norm": 0.4593207538127899,
"kl": 0.1518435962498188,
"learning_rate": 3.262626762369525e-07,
"loss": 0.0061,
"reward": 1.0833333432674408,
"reward_std": 0.23273734748363495,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.0625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 0.5,
"step": 214
},
{
"completion_length": 104.45833778381348,
"epoch": 0.11508095811588386,
"grad_norm": 0.42647111415863037,
"kl": 0.21028569713234901,
"learning_rate": 3.092332998903416e-07,
"loss": 0.0084,
"reward": 1.6458333730697632,
"reward_std": 0.2753772810101509,
"rewards/correctness_reward_func": 0.4166666865348816,
"rewards/int_reward_func": 0.4166666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3333333358168602,
"rewards/xmlcount_reward_func": 0.4791666716337204,
"step": 215
},
{
"completion_length": 325.2500114440918,
"epoch": 0.11561621838619028,
"grad_norm": 0.5365056991577148,
"kl": 0.134639460593462,
"learning_rate": 2.9263101785268253e-07,
"loss": 0.0054,
"reward": 1.604166679084301,
"reward_std": 0.14088044688105583,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.2916666679084301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.375,
"rewards/xmlcount_reward_func": 0.4375,
"step": 216
},
{
"completion_length": 89.54166793823242,
"epoch": 0.11615147865649672,
"grad_norm": 0.6397629976272583,
"kl": 0.21499066427350044,
"learning_rate": 2.764590667717562e-07,
"loss": 0.0086,
"reward": 1.5,
"reward_std": 0.19364917278289795,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.4583333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333358168602,
"rewards/xmlcount_reward_func": 0.5,
"step": 217
},
{
"completion_length": 85.50000095367432,
"epoch": 0.11668673892680316,
"grad_norm": 1.3944755792617798,
"kl": 0.13307987339794636,
"learning_rate": 2.6072059940146775e-07,
"loss": 0.0053,
"reward": 1.541666716337204,
"reward_std": 0.5103103779256344,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.39583333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 218
},
{
"completion_length": 80.66666793823242,
"epoch": 0.1172219991971096,
"grad_norm": 0.08477512001991272,
"kl": 0.16065896674990654,
"learning_rate": 2.454186839872158e-07,
"loss": 0.0064,
"reward": 1.125,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 219
},
{
"completion_length": 72.79166793823242,
"epoch": 0.11775725946741603,
"grad_norm": 0.6473060250282288,
"kl": 0.2047443389892578,
"learning_rate": 2.3055630366772857e-07,
"loss": 0.0082,
"reward": 1.6666666865348816,
"reward_std": 0.20412413775920868,
"rewards/correctness_reward_func": 0.4166666865348816,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 220
},
{
"completion_length": 255.87500381469727,
"epoch": 0.11829251973772247,
"grad_norm": 1.165757656097412,
"kl": 0.34885890036821365,
"learning_rate": 2.1613635589349756e-07,
"loss": 0.014,
"reward": 1.2083333432674408,
"reward_std": 0.5354874432086945,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.3125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3750000074505806,
"rewards/xmlcount_reward_func": 0.4375,
"step": 221
},
{
"completion_length": 68.04166889190674,
"epoch": 0.1188277800080289,
"grad_norm": 1.5734628438949585,
"kl": 0.35767246037721634,
"learning_rate": 2.0216165186191406e-07,
"loss": 0.0143,
"reward": 1.729166716337204,
"reward_std": 0.4259376786649227,
"rewards/correctness_reward_func": 0.5000000223517418,
"rewards/int_reward_func": 0.25000000558793545,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 222
},
{
"completion_length": 93.95833587646484,
"epoch": 0.11936304027833534,
"grad_norm": 0.3049551248550415,
"kl": 0.14186285808682442,
"learning_rate": 1.8863491596921745e-07,
"loss": 0.0057,
"reward": 1.375,
"reward_std": 0.25,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666679084301,
"rewards/xmlcount_reward_func": 0.5,
"step": 223
},
{
"completion_length": 170.7500057220459,
"epoch": 0.11989830054864177,
"grad_norm": 0.8757752180099487,
"kl": 0.17503754422068596,
"learning_rate": 1.7555878527937164e-07,
"loss": 0.007,
"reward": 1.1865417063236237,
"reward_std": 0.413798563182354,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.2291666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333358168602,
"rewards/xmlcount_reward_func": 0.4782083332538605,
"step": 224
},
{
"completion_length": 191.8333396911621,
"epoch": 0.12043356081894821,
"grad_norm": 1.3906254768371582,
"kl": 0.11161109246313572,
"learning_rate": 1.629358090099639e-07,
"loss": 0.0045,
"reward": 0.96875,
"reward_std": 0.29255440831184387,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.1875000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3125000037252903,
"rewards/xmlcount_reward_func": 0.46875,
"step": 225
},
{
"completion_length": 150.75000762939453,
"epoch": 0.12096882108925465,
"grad_norm": 0.7032321691513062,
"kl": 0.23093389347195625,
"learning_rate": 1.507684480352292e-07,
"loss": 0.0092,
"reward": 1.651041716337204,
"reward_std": 0.36715345084667206,
"rewards/correctness_reward_func": 0.3333333432674408,
"rewards/int_reward_func": 0.4375000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.39583333395421505,
"rewards/xmlcount_reward_func": 0.484375,
"step": 226
},
{
"completion_length": 173.04166793823242,
"epoch": 0.12150408135956109,
"grad_norm": 0.777036726474762,
"kl": 0.22725828364491463,
"learning_rate": 1.3905907440629752e-07,
"loss": 0.0091,
"reward": 1.333333358168602,
"reward_std": 0.3425312591716647,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.3750000037252903,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333358168602,
"rewards/xmlcount_reward_func": 0.4791666716337204,
"step": 227
},
{
"completion_length": 114.66667079925537,
"epoch": 0.12203934162986753,
"grad_norm": 2.417041778564453,
"kl": 0.15211265347898006,
"learning_rate": 1.278099708887587e-07,
"loss": 0.0061,
"reward": 2.067708373069763,
"reward_std": 0.8522266149520874,
"rewards/correctness_reward_func": 0.6666666716337204,
"rewards/int_reward_func": 0.4791666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 0.484375,
"step": 228
},
{
"completion_length": 71.45833587646484,
"epoch": 0.12257460190017395,
"grad_norm": 1.4324641227722168,
"kl": 0.2629435919225216,
"learning_rate": 1.1702333051763271e-07,
"loss": 0.0105,
"reward": 1.4375000149011612,
"reward_std": 0.4443886801600456,
"rewards/correctness_reward_func": 0.25,
"rewards/int_reward_func": 0.20833333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 229
},
{
"completion_length": 265.95834159851074,
"epoch": 0.1231098621704804,
"grad_norm": 0.49865278601646423,
"kl": 0.08768011070787907,
"learning_rate": 1.067012561698319e-07,
"loss": 0.0035,
"reward": 0.8750000149011612,
"reward_std": 0.42222120985388756,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.02083333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3333333432674408,
"rewards/xmlcount_reward_func": 0.4375,
"step": 230
},
{
"completion_length": 365.291672706604,
"epoch": 0.12364512244078683,
"grad_norm": 0.5076226592063904,
"kl": 0.13591468706727028,
"learning_rate": 9.684576015420277e-08,
"loss": 0.0054,
"reward": 1.083333358168602,
"reward_std": 0.49768130481243134,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.27083333395421505,
"rewards/xmlcount_reward_func": 0.3958333358168602,
"step": 231
},
{
"completion_length": 95.41666793823242,
"epoch": 0.12418038271109327,
"grad_norm": 1.2022953033447266,
"kl": 0.14907664991915226,
"learning_rate": 8.745876381922147e-08,
"loss": 0.006,
"reward": 1.3697917461395264,
"reward_std": 0.4918174706399441,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.31250000186264515,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.4739583358168602,
"step": 232
},
{
"completion_length": 113.79166793823242,
"epoch": 0.1247156429813997,
"grad_norm": 0.5673840641975403,
"kl": 0.26192033290863037,
"learning_rate": 7.854209717842231e-08,
"loss": 0.0105,
"reward": 1.3125000149011612,
"reward_std": 0.15864631533622742,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.37500000558793545,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375000074505806,
"rewards/xmlcount_reward_func": 0.5,
"step": 233
},
{
"completion_length": 112.12500190734863,
"epoch": 0.12525090325170615,
"grad_norm": 1.3854151964187622,
"kl": 0.17152241989970207,
"learning_rate": 7.009749855363457e-08,
"loss": 0.0069,
"reward": 1.4375000596046448,
"reward_std": 0.4543575756251812,
"rewards/correctness_reward_func": 0.1666666716337204,
"rewards/int_reward_func": 0.33333333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375000074505806,
"rewards/xmlcount_reward_func": 0.5,
"step": 234
},
{
"completion_length": 98.75000381469727,
"epoch": 0.12578616352201258,
"grad_norm": 0.790249764919281,
"kl": 0.14792344719171524,
"learning_rate": 6.212661423609184e-08,
"loss": 0.0059,
"reward": 1.7291666865348816,
"reward_std": 0.5564306005835533,
"rewards/correctness_reward_func": 0.3333333432674408,
"rewards/int_reward_func": 0.4791666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 235
},
{
"completion_length": 144.8333396911621,
"epoch": 0.126321423792319,
"grad_norm": 0.2076501101255417,
"kl": 0.18184524960815907,
"learning_rate": 5.463099816548578e-08,
"loss": 0.0073,
"reward": 1.4479166865348816,
"reward_std": 0.2685803771018982,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.4583333358168602,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 0.46875,
"step": 236
},
{
"completion_length": 142.12500190734863,
"epoch": 0.12685668406262546,
"grad_norm": 1.9539458751678467,
"kl": 0.24447984993457794,
"learning_rate": 4.761211162702117e-08,
"loss": 0.0098,
"reward": 1.2968750596046448,
"reward_std": 0.29226116091012955,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.4791666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3333333469927311,
"rewards/xmlcount_reward_func": 0.484375,
"step": 237
},
{
"completion_length": 72.37500381469727,
"epoch": 0.12739194433293188,
"grad_norm": 1.3912091255187988,
"kl": 0.29350727051496506,
"learning_rate": 4.1071322966535487e-08,
"loss": 0.0117,
"reward": 1.291666716337204,
"reward_std": 0.2096773497760296,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.3125000074505806,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 238
},
{
"completion_length": 82.41667175292969,
"epoch": 0.12792720460323834,
"grad_norm": 0.43811848759651184,
"kl": 0.202481709420681,
"learning_rate": 3.5009907323737826e-08,
"loss": 0.0081,
"reward": 1.2083333432674408,
"reward_std": 0.06454972922801971,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.2291666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 239
},
{
"completion_length": 85.45833587646484,
"epoch": 0.12846246487354476,
"grad_norm": 0.8119534850120544,
"kl": 0.23333512246608734,
"learning_rate": 2.9429046383618042e-08,
"loss": 0.0093,
"reward": 1.1666666865348816,
"reward_std": 0.11949635669589043,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.1666666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 240
},
{
"completion_length": 160.91666984558105,
"epoch": 0.1289977251438512,
"grad_norm": 1.1854231357574463,
"kl": 0.15807193890213966,
"learning_rate": 2.4329828146074096e-08,
"loss": 0.0063,
"reward": 0.8219999894499779,
"reward_std": 0.6791375987231731,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.1875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375000074505806,
"rewards/xmlcount_reward_func": 0.19699998199939728,
"step": 241
},
{
"completion_length": 72.75000190734863,
"epoch": 0.12953298541415764,
"grad_norm": 0.45305031538009644,
"kl": 0.20153097435832024,
"learning_rate": 1.9713246713805588e-08,
"loss": 0.0081,
"reward": 1.3125,
"reward_std": 0.22008520364761353,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.2291666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 242
},
{
"completion_length": 152.1666717529297,
"epoch": 0.13006824568446407,
"grad_norm": 0.4157455861568451,
"kl": 0.1709270216524601,
"learning_rate": 1.5580202098509078e-08,
"loss": 0.0068,
"reward": 1.4427083432674408,
"reward_std": 0.345734566450119,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.4583333432674408,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666679084301,
"rewards/xmlcount_reward_func": 0.484375,
"step": 243
},
{
"completion_length": 72.79166889190674,
"epoch": 0.1306035059547705,
"grad_norm": 1.6172090768814087,
"kl": 0.1881002075970173,
"learning_rate": 1.193150004542204e-08,
"loss": 0.0075,
"reward": 1.4322916865348816,
"reward_std": 0.26791293919086456,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.4947916716337204,
"step": 244
},
{
"completion_length": 63.541666984558105,
"epoch": 0.13113876622507695,
"grad_norm": 0.6279511451721191,
"kl": 0.26533466950058937,
"learning_rate": 8.767851876239075e-09,
"loss": 0.0106,
"reward": 1.2708333432674408,
"reward_std": 0.05103103816509247,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.27083333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 245
},
{
"completion_length": 188.20833587646484,
"epoch": 0.13167402649538337,
"grad_norm": 0.917432963848114,
"kl": 0.16238786652684212,
"learning_rate": 6.089874350439507e-09,
"loss": 0.0065,
"reward": 1.6145834028720856,
"reward_std": 0.7706367075443268,
"rewards/correctness_reward_func": 0.3333333432674408,
"rewards/int_reward_func": 0.3958333432674408,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4166666716337204,
"rewards/xmlcount_reward_func": 0.46875,
"step": 246
},
{
"completion_length": 102.25000381469727,
"epoch": 0.13220928676568983,
"grad_norm": 0.6094866394996643,
"kl": 0.1517469845712185,
"learning_rate": 3.8980895450474455e-09,
"loss": 0.0061,
"reward": 1.2708333432674408,
"reward_std": 0.27258947491645813,
"rewards/correctness_reward_func": 0.0833333358168602,
"rewards/int_reward_func": 0.2291666716337204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4583333358168602,
"rewards/xmlcount_reward_func": 0.5,
"step": 247
},
{
"completion_length": 265.0000114440918,
"epoch": 0.13274454703599625,
"grad_norm": 1.4663077592849731,
"kl": 0.1445157825946808,
"learning_rate": 2.192924752854042e-09,
"loss": 0.0058,
"reward": 0.9218750074505806,
"reward_std": 0.21220333129167557,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.18750000186264515,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3125,
"rewards/xmlcount_reward_func": 0.421875,
"step": 248
},
{
"completion_length": 78.83333587646484,
"epoch": 0.13327980730630268,
"grad_norm": 1.0312261581420898,
"kl": 0.16930826753377914,
"learning_rate": 9.747123991141193e-10,
"loss": 0.0068,
"reward": 1.229166716337204,
"reward_std": 0.1530931033194065,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25000000558793545,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4791666716337204,
"rewards/xmlcount_reward_func": 0.5,
"step": 249
},
{
"completion_length": 271.9166793823242,
"epoch": 0.13381506757660913,
"grad_norm": 0.3518020808696747,
"kl": 0.12262176536023617,
"learning_rate": 2.43689976739403e-10,
"loss": 0.0049,
"reward": 1.5572916716337204,
"reward_std": 0.25069504231214523,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.20833333395421505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3958333395421505,
"rewards/xmlcount_reward_func": 0.453125,
"step": 250
}
],
"logging_steps": 1,
"max_steps": 250,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}