|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.13381506757660913, |
|
"eval_steps": 500, |
|
"global_step": 250, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 281.1666851043701, |
|
"epoch": 0.0005352602703064365, |
|
"grad_norm": 0.5917035937309265, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": -0.0, |
|
"reward": -0.10487502068281174, |
|
"reward_std": 0.644918380305171, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.1041666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.37570834904909134, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 590.8750171661377, |
|
"epoch": 0.001070520540612873, |
|
"grad_norm": 0.7477704286575317, |
|
"kl": 0.0, |
|
"learning_rate": 2.0000000000000002e-07, |
|
"loss": 0.0, |
|
"reward": 0.22162500163540244, |
|
"reward_std": 0.09485530573874712, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.09662500163540244, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 539.6666870117188, |
|
"epoch": 0.0016057808109193096, |
|
"grad_norm": 0.4429571032524109, |
|
"kl": 0.002077269156870898, |
|
"learning_rate": 4.0000000000000003e-07, |
|
"loss": 0.0001, |
|
"reward": 0.016208335757255554, |
|
"reward_std": 0.6246479228138924, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.2083333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1921250014565885, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 185.7916717529297, |
|
"epoch": 0.002141041081225746, |
|
"grad_norm": 0.6855182647705078, |
|
"kl": 0.0009879921708488837, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 0.0, |
|
"reward": 1.0669583305716515, |
|
"reward_std": 0.5203845072537661, |
|
"rewards/correctness_reward_func": 0.7500000298023224, |
|
"rewards/int_reward_func": 0.3541666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.03720833268016577, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 587.3750076293945, |
|
"epoch": 0.0026763013515321826, |
|
"grad_norm": 0.7880218029022217, |
|
"kl": 0.0007925744503154419, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 0.0, |
|
"reward": 0.09533333079889417, |
|
"reward_std": 0.5927679911255836, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.1041666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.09216667525470257, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 245.70834159851074, |
|
"epoch": 0.003211561621838619, |
|
"grad_norm": 0.9220851063728333, |
|
"kl": 0.001330614773905836, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0001, |
|
"reward": 0.08295834437012672, |
|
"reward_std": 0.5829638005234301, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.0416666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.04204164445400238, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 448.7083549499512, |
|
"epoch": 0.0037468218921450553, |
|
"grad_norm": 0.8303655385971069, |
|
"kl": 0.0012542481999844313, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.0001, |
|
"reward": 0.03099999949336052, |
|
"reward_std": 0.7598095312714577, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.16666666977107525, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.13566668145358562, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 197.95833587646484, |
|
"epoch": 0.004282082162451492, |
|
"grad_norm": 1.0582274198532104, |
|
"kl": 0.0008733256690902635, |
|
"learning_rate": 1.4000000000000001e-06, |
|
"loss": 0.0, |
|
"reward": 0.12341666966676712, |
|
"reward_std": 0.09142086654901505, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.12341666780412197, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 176.41666984558105, |
|
"epoch": 0.004817342432757929, |
|
"grad_norm": 1.0369133949279785, |
|
"kl": 0.005098502180771902, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 0.0002, |
|
"reward": 0.32220835238695145, |
|
"reward_std": 0.3896215371787548, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.11387500539422035, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 267.66667556762695, |
|
"epoch": 0.005352602703064365, |
|
"grad_norm": 0.4552709460258484, |
|
"kl": 0.001112774269131478, |
|
"learning_rate": 1.8000000000000001e-06, |
|
"loss": 0.0, |
|
"reward": 0.31854166090488434, |
|
"reward_std": 0.3717608004808426, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.1250000037252903, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02687499998137355, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 319.58333587646484, |
|
"epoch": 0.005887862973370801, |
|
"grad_norm": 0.7992357015609741, |
|
"kl": 0.0012863876472692937, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.0001, |
|
"reward": 0.2948333490639925, |
|
"reward_std": 0.38963131979107857, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.10416666977107525, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.10733332857489586, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 375.29166984558105, |
|
"epoch": 0.006423123243677238, |
|
"grad_norm": 0.965032160282135, |
|
"kl": 0.0008803782802715432, |
|
"learning_rate": 2.2e-06, |
|
"loss": 0.0, |
|
"reward": 1.012750007212162, |
|
"reward_std": 0.5616761147975922, |
|
"rewards/correctness_reward_func": 0.8333333432674408, |
|
"rewards/int_reward_func": 0.22916666977107525, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.04975000023841858, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 482.4583435058594, |
|
"epoch": 0.0069583835139836745, |
|
"grad_norm": 1.502334713935852, |
|
"kl": 0.0028791724907932803, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.0001, |
|
"reward": -0.32758333161473274, |
|
"reward_std": 0.7293146029114723, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.02083333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.34841667115688324, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 236.41667366027832, |
|
"epoch": 0.007493643784290111, |
|
"grad_norm": 0.8761454820632935, |
|
"kl": 0.0018893379892688245, |
|
"learning_rate": 2.6e-06, |
|
"loss": 0.0001, |
|
"reward": -0.017458327114582062, |
|
"reward_std": 0.31026666425168514, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.06250000186264515, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07995832804590464, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 306.6250104904175, |
|
"epoch": 0.008028904054596548, |
|
"grad_norm": 1.070983648300171, |
|
"kl": 0.0017425262776669115, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 0.0001, |
|
"reward": -0.08808333426713943, |
|
"reward_std": 0.37953382171690464, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08808333426713943, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 366.6666793823242, |
|
"epoch": 0.008564164324902984, |
|
"grad_norm": 0.6342064738273621, |
|
"kl": 0.0009713478648336604, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.03991668112576008, |
|
"reward_std": 0.8613052181899548, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.0416666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.16841666959226131, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 297.75000381469727, |
|
"epoch": 0.00909942459520942, |
|
"grad_norm": 0.629483163356781, |
|
"kl": 0.0008868449804140255, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 0.0, |
|
"reward": 1.0445833504199982, |
|
"reward_std": 0.3152644243091345, |
|
"rewards/correctness_reward_func": 0.6666666716337204, |
|
"rewards/int_reward_func": 0.3333333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.044583337381482124, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 185.79166793823242, |
|
"epoch": 0.009634684865515858, |
|
"grad_norm": 1.0232924222946167, |
|
"kl": 0.0012610588310053572, |
|
"learning_rate": 3.4000000000000005e-06, |
|
"loss": 0.0001, |
|
"reward": 0.21658334136009216, |
|
"reward_std": 0.18211832642555237, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0625, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.15408333763480186, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 361.5416679382324, |
|
"epoch": 0.010169945135822294, |
|
"grad_norm": 0.8113482594490051, |
|
"kl": 0.0015902465383987874, |
|
"learning_rate": 3.6000000000000003e-06, |
|
"loss": 0.0001, |
|
"reward": 0.44666668586432934, |
|
"reward_std": 0.5656739473342896, |
|
"rewards/correctness_reward_func": 0.2500000074505806, |
|
"rewards/int_reward_func": 0.18750000558793545, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.009166665724478662, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 254.66666984558105, |
|
"epoch": 0.01070520540612873, |
|
"grad_norm": 1.1768397092819214, |
|
"kl": 0.004655150449252687, |
|
"learning_rate": 3.8000000000000005e-06, |
|
"loss": 0.0002, |
|
"reward": 0.48504166305065155, |
|
"reward_std": 0.4368314128369093, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.06837500259280205, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 137.91667366027832, |
|
"epoch": 0.011240465676435166, |
|
"grad_norm": 1.0621542930603027, |
|
"kl": 0.0022582018573302776, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3660416714847088, |
|
"reward_std": 0.5700555201619864, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.1875, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.011874992400407791, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 88.91667175292969, |
|
"epoch": 0.011775725946741603, |
|
"grad_norm": 1.2175222635269165, |
|
"kl": 0.0020840048528043553, |
|
"learning_rate": 4.2000000000000004e-06, |
|
"loss": 0.0001, |
|
"reward": 0.8160417033359408, |
|
"reward_std": 0.26041645370423794, |
|
"rewards/correctness_reward_func": 0.4166666865348816, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.14937500189989805, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 150.2083339691162, |
|
"epoch": 0.01231098621704804, |
|
"grad_norm": 0.8459586501121521, |
|
"kl": 0.0017513818893348798, |
|
"learning_rate": 4.4e-06, |
|
"loss": 0.0001, |
|
"reward": 0.5479583460837603, |
|
"reward_std": 0.6487118303775787, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.2916666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0062916697934269905, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 154.87500381469727, |
|
"epoch": 0.012846246487354477, |
|
"grad_norm": 0.8675287961959839, |
|
"kl": 0.001371489226585254, |
|
"learning_rate": 4.600000000000001e-06, |
|
"loss": 0.0001, |
|
"reward": 0.17075001262128353, |
|
"reward_std": 0.19458706118166447, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.14583333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.024916673079133034, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 150.7500057220459, |
|
"epoch": 0.013381506757660913, |
|
"grad_norm": 0.8717443943023682, |
|
"kl": 0.0020001856610178947, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.0001, |
|
"reward": 0.20204169023782015, |
|
"reward_std": 0.4490640014410019, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.06250000186264515, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.027124996297061443, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 467.9583568572998, |
|
"epoch": 0.013916767027967349, |
|
"grad_norm": 1.1116266250610352, |
|
"kl": 0.0012415697274263948, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.304541677236557, |
|
"reward_std": 0.28451096825301647, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.14583333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.07537499908357859, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 433.5833339691162, |
|
"epoch": 0.014452027298273785, |
|
"grad_norm": 0.8203855156898499, |
|
"kl": 0.001017560571199283, |
|
"learning_rate": 4.999756310023261e-06, |
|
"loss": 0.0, |
|
"reward": 0.2795000094920397, |
|
"reward_std": 0.11828233953565359, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.14583333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.13366666994988918, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 126.00000381469727, |
|
"epoch": 0.014987287568580221, |
|
"grad_norm": 1.2663164138793945, |
|
"kl": 0.0014343319344334304, |
|
"learning_rate": 4.999025287600886e-06, |
|
"loss": 0.0001, |
|
"reward": 0.6117500364780426, |
|
"reward_std": 0.4831337593495846, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.2083333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.2367500104010105, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 229.66667556762695, |
|
"epoch": 0.01552254783888666, |
|
"grad_norm": 0.8950901627540588, |
|
"kl": 0.0015667550032958388, |
|
"learning_rate": 4.997807075247147e-06, |
|
"loss": 0.0001, |
|
"reward": 0.1262916720006615, |
|
"reward_std": 0.148749228566885, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.02083333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.1054583361838013, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 210.8750114440918, |
|
"epoch": 0.016057808109193095, |
|
"grad_norm": 1.1234443187713623, |
|
"kl": 0.0019746975012822077, |
|
"learning_rate": 4.996101910454953e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3709999993443489, |
|
"reward_std": 0.3687104620039463, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.20833333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0793333351612091, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 177.62500190734863, |
|
"epoch": 0.01659306837949953, |
|
"grad_norm": 0.9238327145576477, |
|
"kl": 0.0028813415410695598, |
|
"learning_rate": 4.993910125649561e-06, |
|
"loss": 0.0001, |
|
"reward": 0.39008335024118423, |
|
"reward_std": 0.3615882135927677, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.27083333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.11925000231713057, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 173.79167366027832, |
|
"epoch": 0.017128328649805968, |
|
"grad_norm": 1.0448777675628662, |
|
"kl": 0.006160023040138185, |
|
"learning_rate": 4.9912321481237616e-06, |
|
"loss": 0.0002, |
|
"reward": 0.14554167166352272, |
|
"reward_std": 0.16822291910648346, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0416666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.10387500375509262, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 486.75000762939453, |
|
"epoch": 0.017663588920112404, |
|
"grad_norm": 0.6601158380508423, |
|
"kl": 0.001103398812119849, |
|
"learning_rate": 4.988068499954578e-06, |
|
"loss": 0.0, |
|
"reward": -0.5304166711866856, |
|
"reward_std": 1.0785409808158875, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.06250000186264515, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.6762500237673521, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 268.12501525878906, |
|
"epoch": 0.01819884919041884, |
|
"grad_norm": 0.910798966884613, |
|
"kl": 0.0013230827316874638, |
|
"learning_rate": 4.984419797901491e-06, |
|
"loss": 0.0001, |
|
"reward": 0.6703750789165497, |
|
"reward_std": 0.36514274775981903, |
|
"rewards/correctness_reward_func": 0.4166666865348816, |
|
"rewards/int_reward_func": 0.12500000558793545, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.1287083402276039, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 267.708345413208, |
|
"epoch": 0.018734109460725276, |
|
"grad_norm": 1.027286171913147, |
|
"kl": 0.004269710392691195, |
|
"learning_rate": 4.980286753286196e-06, |
|
"loss": 0.0002, |
|
"reward": 0.4232500046491623, |
|
"reward_std": 0.2645573355257511, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.27083334140479565, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.1524166688323021, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 245.8750114440918, |
|
"epoch": 0.019269369731031716, |
|
"grad_norm": 1.0046695470809937, |
|
"kl": 0.005482323234900832, |
|
"learning_rate": 4.975670171853926e-06, |
|
"loss": 0.0002, |
|
"reward": 0.2149583324790001, |
|
"reward_std": 0.7910580635070801, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.2083333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.07670832984149456, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 560.7916870117188, |
|
"epoch": 0.019804630001338152, |
|
"grad_norm": 0.8885159492492676, |
|
"kl": 0.002885772308218293, |
|
"learning_rate": 4.970570953616383e-06, |
|
"loss": 0.0001, |
|
"reward": 0.1944583347067237, |
|
"reward_std": 0.05626108031719923, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.1944583347067237, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 522.8750133514404, |
|
"epoch": 0.020339890271644588, |
|
"grad_norm": 0.5502146482467651, |
|
"kl": 0.009517412836430594, |
|
"learning_rate": 4.964990092676263e-06, |
|
"loss": 0.0004, |
|
"reward": 0.2919999957084656, |
|
"reward_std": 0.674026682972908, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08299997518770397, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 640.2083511352539, |
|
"epoch": 0.020875150541951024, |
|
"grad_norm": 0.7062350511550903, |
|
"kl": 0.0034277847153134644, |
|
"learning_rate": 4.958928677033465e-06, |
|
"loss": 0.0001, |
|
"reward": -0.004291646182537079, |
|
"reward_std": 0.7502853199839592, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.1250000037252903, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.21262501180171967, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 491.08335876464844, |
|
"epoch": 0.02141041081225746, |
|
"grad_norm": 0.7743443250656128, |
|
"kl": 0.004902548622339964, |
|
"learning_rate": 4.9523878883729794e-06, |
|
"loss": 0.0002, |
|
"reward": 0.06212499737739563, |
|
"reward_std": 0.8643132671713829, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.1458333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.1670416765846312, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 301.45834159851074, |
|
"epoch": 0.021945671082563897, |
|
"grad_norm": 0.7774906754493713, |
|
"kl": 0.007360402669291943, |
|
"learning_rate": 4.9453690018345144e-06, |
|
"loss": 0.0003, |
|
"reward": -0.09349998086690903, |
|
"reward_std": 0.7777874618768692, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.1041666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.2810000069439411, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 455.5833435058594, |
|
"epoch": 0.022480931352870333, |
|
"grad_norm": 0.6825084090232849, |
|
"kl": 0.0033266296959482133, |
|
"learning_rate": 4.937873385763909e-06, |
|
"loss": 0.0001, |
|
"reward": 0.2603750079870224, |
|
"reward_std": 1.1693618446588516, |
|
"rewards/correctness_reward_func": 0.2500000074505806, |
|
"rewards/int_reward_func": 0.1458333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.02083333395421505, |
|
"rewards/xmlcount_reward_func": -0.15629167575389147, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 254.7500057220459, |
|
"epoch": 0.02301619162317677, |
|
"grad_norm": 1.0424985885620117, |
|
"kl": 0.013458715460728854, |
|
"learning_rate": 4.9299025014463665e-06, |
|
"loss": 0.0005, |
|
"reward": 0.6490416824817657, |
|
"reward_std": 0.6297374591231346, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.3125000111758709, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.16987500933464617, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 243.8333339691162, |
|
"epoch": 0.023551451893483205, |
|
"grad_norm": 0.6618691682815552, |
|
"kl": 0.013514326536096632, |
|
"learning_rate": 4.921457902821578e-06, |
|
"loss": 0.0005, |
|
"reward": 0.235000004991889, |
|
"reward_std": 0.4793561212718487, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.12500000558793545, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.02666667103767395, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 247.1250057220459, |
|
"epoch": 0.02408671216378964, |
|
"grad_norm": 0.8150098919868469, |
|
"kl": 0.006718623684719205, |
|
"learning_rate": 4.912541236180779e-06, |
|
"loss": 0.0003, |
|
"reward": 0.3942916840314865, |
|
"reward_std": 0.4090446010231972, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.1458333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.08179166866466403, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 311.95833587646484, |
|
"epoch": 0.02462197243409608, |
|
"grad_norm": 0.9185469746589661, |
|
"kl": 0.011615818890277296, |
|
"learning_rate": 4.903154239845798e-06, |
|
"loss": 0.0005, |
|
"reward": 0.6513333357870579, |
|
"reward_std": 0.26453326642513275, |
|
"rewards/correctness_reward_func": 0.5, |
|
"rewards/int_reward_func": 0.14583333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.005500006955116987, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 347.25000762939453, |
|
"epoch": 0.025157232704402517, |
|
"grad_norm": 0.9208869338035583, |
|
"kl": 0.019979659002274275, |
|
"learning_rate": 4.893298743830168e-06, |
|
"loss": 0.0008, |
|
"reward": 0.4205416589975357, |
|
"reward_std": 0.4555768258869648, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.0625, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.108041662722826, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 245.6666717529297, |
|
"epoch": 0.025692492974708953, |
|
"grad_norm": 0.8743010759353638, |
|
"kl": 0.01431413902901113, |
|
"learning_rate": 4.882976669482368e-06, |
|
"loss": 0.0006, |
|
"reward": 0.269333329051733, |
|
"reward_std": 0.6685181586071849, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.06099999323487282, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 431.12501525878906, |
|
"epoch": 0.02622775324501539, |
|
"grad_norm": 0.6829207539558411, |
|
"kl": 0.010582708870060742, |
|
"learning_rate": 4.8721900291112415e-06, |
|
"loss": 0.0004, |
|
"reward": 1.1573750227689743, |
|
"reward_std": 0.5191336497664452, |
|
"rewards/correctness_reward_func": 0.75, |
|
"rewards/int_reward_func": 0.27083333767950535, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.1365416720509529, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 362.2916793823242, |
|
"epoch": 0.026763013515321826, |
|
"grad_norm": 0.9591223001480103, |
|
"kl": 0.021306635811924934, |
|
"learning_rate": 4.860940925593703e-06, |
|
"loss": 0.0009, |
|
"reward": 0.5036666616797447, |
|
"reward_std": 0.6060219556093216, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.16666666977107525, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.06250000186264515, |
|
"rewards/xmlcount_reward_func": 0.19116667530033737, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 105.25, |
|
"epoch": 0.02729827378562826, |
|
"grad_norm": 1.2040213346481323, |
|
"kl": 0.03592631733044982, |
|
"learning_rate": 4.849231551964771e-06, |
|
"loss": 0.0014, |
|
"reward": 0.9875416681170464, |
|
"reward_std": 0.5127010717988014, |
|
"rewards/correctness_reward_func": 0.4166666865348816, |
|
"rewards/int_reward_func": 0.20833334140479565, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0416666679084301, |
|
"rewards/xmlcount_reward_func": 0.32087502628564835, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 105.50000190734863, |
|
"epoch": 0.027833534055934698, |
|
"grad_norm": 0.9755409359931946, |
|
"kl": 0.03177254740148783, |
|
"learning_rate": 4.837064190990036e-06, |
|
"loss": 0.0013, |
|
"reward": 0.36112499982118607, |
|
"reward_std": 0.23680441081523895, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.14583333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.02083333395421505, |
|
"rewards/xmlcount_reward_func": 0.19445833936333656, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 115.08333778381348, |
|
"epoch": 0.028368794326241134, |
|
"grad_norm": 1.0774939060211182, |
|
"kl": 0.04096163995563984, |
|
"learning_rate": 4.824441214720629e-06, |
|
"loss": 0.0016, |
|
"reward": 0.39137500151991844, |
|
"reward_std": 0.21243570372462273, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0416666679084301, |
|
"rewards/xmlcount_reward_func": 0.22470833733677864, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 71.87500095367432, |
|
"epoch": 0.02890405459654757, |
|
"grad_norm": 1.5860177278518677, |
|
"kl": 0.07264666631817818, |
|
"learning_rate": 4.811365084030784e-06, |
|
"loss": 0.0029, |
|
"reward": 0.7297500222921371, |
|
"reward_std": 0.40594012290239334, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.25000000558793545, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0416666679084301, |
|
"rewards/xmlcount_reward_func": 0.35475001111626625, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 335.0416831970215, |
|
"epoch": 0.029439314866854006, |
|
"grad_norm": 0.8996623158454895, |
|
"kl": 0.03017168352380395, |
|
"learning_rate": 4.7978383481380865e-06, |
|
"loss": 0.0012, |
|
"reward": 0.21566667163278908, |
|
"reward_std": 0.38637750223279, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0625, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0416666679084301, |
|
"rewards/xmlcount_reward_func": 0.11149999999906868, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 116.00000476837158, |
|
"epoch": 0.029974575137160443, |
|
"grad_norm": 1.1853581666946411, |
|
"kl": 0.03844497771933675, |
|
"learning_rate": 4.783863644106502e-06, |
|
"loss": 0.0015, |
|
"reward": 0.35095833986997604, |
|
"reward_std": 0.21118691470474005, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0416666679084301, |
|
"rewards/xmlcount_reward_func": 0.18429166823625565, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 203.58334350585938, |
|
"epoch": 0.030509835407466882, |
|
"grad_norm": 0.9677571654319763, |
|
"kl": 0.03654019068926573, |
|
"learning_rate": 4.769443696332272e-06, |
|
"loss": 0.0015, |
|
"reward": 0.867875000461936, |
|
"reward_std": 0.6674522012472153, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.2708333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.08333333395421505, |
|
"rewards/xmlcount_reward_func": 0.1803750041872263, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 226.58334159851074, |
|
"epoch": 0.03104509567777332, |
|
"grad_norm": 0.7546242475509644, |
|
"kl": 0.10618894919753075, |
|
"learning_rate": 4.754581316012785e-06, |
|
"loss": 0.0042, |
|
"reward": 0.7405833136290312, |
|
"reward_std": 1.0614993423223495, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.2708333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.09474998340010643, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 96.75000190734863, |
|
"epoch": 0.03158035594807975, |
|
"grad_norm": 0.9959896206855774, |
|
"kl": 0.040049958042800426, |
|
"learning_rate": 4.7392794005985324e-06, |
|
"loss": 0.0016, |
|
"reward": 1.0202916860580444, |
|
"reward_std": 0.5125212594866753, |
|
"rewards/correctness_reward_func": 0.2500000074505806, |
|
"rewards/int_reward_func": 0.4166666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.35362500697374344, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 77.83333587646484, |
|
"epoch": 0.03211561621838619, |
|
"grad_norm": 1.0476152896881104, |
|
"kl": 0.08419935218989849, |
|
"learning_rate": 4.723540933228245e-06, |
|
"loss": 0.0034, |
|
"reward": 0.6385000422596931, |
|
"reward_std": 0.23676574788987637, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.27083333767950535, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.36766667664051056, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 76.0416669845581, |
|
"epoch": 0.03265087648869262, |
|
"grad_norm": 1.6621463298797607, |
|
"kl": 0.06288609141483903, |
|
"learning_rate": 4.707368982147318e-06, |
|
"loss": 0.0025, |
|
"reward": 0.7950416952371597, |
|
"reward_std": 0.2815567087382078, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.2916666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.1458333395421505, |
|
"rewards/xmlcount_reward_func": 0.35754168033599854, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 97.87500190734863, |
|
"epoch": 0.03318613675899906, |
|
"grad_norm": 1.3513109683990479, |
|
"kl": 0.07356535829603672, |
|
"learning_rate": 4.690766700109659e-06, |
|
"loss": 0.0029, |
|
"reward": 1.054708331823349, |
|
"reward_std": 0.3879717066884041, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.4791666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.06250000186264515, |
|
"rewards/xmlcount_reward_func": 0.3463750060182065, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 356.79167556762695, |
|
"epoch": 0.0337213970293055, |
|
"grad_norm": 0.9732988476753235, |
|
"kl": 0.0708354264497757, |
|
"learning_rate": 4.673737323763048e-06, |
|
"loss": 0.0028, |
|
"reward": 0.3889166936278343, |
|
"reward_std": 1.0406904257833958, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0833333358168602, |
|
"rewards/xmlcount_reward_func": 0.05558334290981293, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 124.16666984558105, |
|
"epoch": 0.034256657299611935, |
|
"grad_norm": 1.4622650146484375, |
|
"kl": 0.08344197925180197, |
|
"learning_rate": 4.656284173018144e-06, |
|
"loss": 0.0033, |
|
"reward": 1.0468750149011612, |
|
"reward_std": 0.7830385342240334, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.2708333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.16666666977107525, |
|
"rewards/xmlcount_reward_func": 0.3593750074505806, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 233.333345413208, |
|
"epoch": 0.034791917569918375, |
|
"grad_norm": 2.0504956245422363, |
|
"kl": 0.07171727810055017, |
|
"learning_rate": 4.638410650401267e-06, |
|
"loss": 0.0029, |
|
"reward": 1.2617916613817215, |
|
"reward_std": 0.8660007119178772, |
|
"rewards/correctness_reward_func": 0.5000000149011612, |
|
"rewards/int_reward_func": 0.3750000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.3242916837334633, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 147.5833339691162, |
|
"epoch": 0.03532717784022481, |
|
"grad_norm": 0.6744219660758972, |
|
"kl": 0.11512961238622665, |
|
"learning_rate": 4.620120240391065e-06, |
|
"loss": 0.0046, |
|
"reward": 0.8016250282526016, |
|
"reward_std": 0.29210690781474113, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.2291666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0416666679084301, |
|
"rewards/xmlcount_reward_func": 0.44745834171772003, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 76.41666793823242, |
|
"epoch": 0.03586243811053125, |
|
"grad_norm": 0.8137429356575012, |
|
"kl": 0.11052755452692509, |
|
"learning_rate": 4.601416508739211e-06, |
|
"loss": 0.0044, |
|
"reward": 1.2232083678245544, |
|
"reward_std": 0.6085939556360245, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.18750000558793545, |
|
"rewards/xmlcount_reward_func": 0.45237500965595245, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 103.91666984558105, |
|
"epoch": 0.03639769838083768, |
|
"grad_norm": 1.3938682079315186, |
|
"kl": 0.10067875497043133, |
|
"learning_rate": 4.582303101775249e-06, |
|
"loss": 0.004, |
|
"reward": 0.7972083538770676, |
|
"reward_std": 0.43218278884887695, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.14583333767950535, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.22916667349636555, |
|
"rewards/xmlcount_reward_func": 0.42220834642648697, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 607.5000114440918, |
|
"epoch": 0.03693295865114412, |
|
"grad_norm": 0.7114788293838501, |
|
"kl": 0.059856235020561144, |
|
"learning_rate": 4.562783745695738e-06, |
|
"loss": 0.0024, |
|
"reward": 0.6020833402872086, |
|
"reward_std": 0.26822593063116074, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.1875000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.1041666716337204, |
|
"rewards/xmlcount_reward_func": 0.31041666865348816, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 366.1666736602783, |
|
"epoch": 0.03746821892145055, |
|
"grad_norm": 1.5701572895050049, |
|
"kl": 0.12799374386668205, |
|
"learning_rate": 4.542862245837821e-06, |
|
"loss": 0.0051, |
|
"reward": 0.6387916915118694, |
|
"reward_std": 0.3272341303527355, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.16666667349636555, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.14583333395421505, |
|
"rewards/xmlcount_reward_func": 0.3262916784733534, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 150.9166717529297, |
|
"epoch": 0.03800347919175699, |
|
"grad_norm": 1.3559887409210205, |
|
"kl": 0.17991142719984055, |
|
"learning_rate": 4.522542485937369e-06, |
|
"loss": 0.0072, |
|
"reward": 0.8541667014360428, |
|
"reward_std": 0.28774577379226685, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.2500000074505806, |
|
"rewards/xmlcount_reward_func": 0.4791666716337204, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 92.08333778381348, |
|
"epoch": 0.03853873946206343, |
|
"grad_norm": 0.9042914509773254, |
|
"kl": 0.21130416169762611, |
|
"learning_rate": 4.501828427371834e-06, |
|
"loss": 0.0085, |
|
"reward": 0.7811249941587448, |
|
"reward_std": 0.20265305042266846, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.1666666679084301, |
|
"rewards/xmlcount_reward_func": 0.4894583374261856, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 125.45833778381348, |
|
"epoch": 0.039073999732369864, |
|
"grad_norm": 0.8635703921318054, |
|
"kl": 0.13566209375858307, |
|
"learning_rate": 4.4807241083879774e-06, |
|
"loss": 0.0054, |
|
"reward": 1.1882500350475311, |
|
"reward_std": 0.8479792177677155, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.1875000037252903, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.2708333358168602, |
|
"rewards/xmlcount_reward_func": 0.39658334106206894, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 68.79166984558105, |
|
"epoch": 0.039609260002676304, |
|
"grad_norm": 1.192706823348999, |
|
"kl": 0.27350207418203354, |
|
"learning_rate": 4.4592336433146e-06, |
|
"loss": 0.0109, |
|
"reward": 1.7708333730697632, |
|
"reward_std": 0.11558076366782188, |
|
"rewards/correctness_reward_func": 0.5, |
|
"rewards/int_reward_func": 0.3125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333432674408, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 90.79166984558105, |
|
"epoch": 0.04014452027298274, |
|
"grad_norm": 1.3415632247924805, |
|
"kl": 0.15339597314596176, |
|
"learning_rate": 4.437361221760449e-06, |
|
"loss": 0.0061, |
|
"reward": 1.0463333874940872, |
|
"reward_std": 0.5073548853397369, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.2083333358168602, |
|
"rewards/xmlcount_reward_func": 0.46300000697374344, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 163.20834159851074, |
|
"epoch": 0.040679780543289176, |
|
"grad_norm": 1.5307588577270508, |
|
"kl": 0.1565667698159814, |
|
"learning_rate": 4.415111107797445e-06, |
|
"loss": 0.0063, |
|
"reward": 1.239583358168602, |
|
"reward_std": 0.48138320073485374, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.2083333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3125000074505806, |
|
"rewards/xmlcount_reward_func": 0.4687500074505806, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 162.20834159851074, |
|
"epoch": 0.04121504081359561, |
|
"grad_norm": 1.7075368165969849, |
|
"kl": 0.2482675537467003, |
|
"learning_rate": 4.3924876391293915e-06, |
|
"loss": 0.0099, |
|
"reward": 1.5780000388622284, |
|
"reward_std": 0.7741712592542171, |
|
"rewards/correctness_reward_func": 0.416666679084301, |
|
"rewards/int_reward_func": 0.3333333469927311, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.37500000558793545, |
|
"rewards/xmlcount_reward_func": 0.453000009059906, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 90.83333587646484, |
|
"epoch": 0.04175030108390205, |
|
"grad_norm": 1.605960726737976, |
|
"kl": 0.1952410712838173, |
|
"learning_rate": 4.36949522624633e-06, |
|
"loss": 0.0078, |
|
"reward": 1.7031250596046448, |
|
"reward_std": 0.5450708866119385, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.2083333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3333333395421505, |
|
"rewards/xmlcount_reward_func": 0.4947916716337204, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 136.16666984558105, |
|
"epoch": 0.04228556135420848, |
|
"grad_norm": 0.673092246055603, |
|
"kl": 0.1753321774303913, |
|
"learning_rate": 4.346138351564711e-06, |
|
"loss": 0.007, |
|
"reward": 1.2291666716337204, |
|
"reward_std": 0.4924144148826599, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.291666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.29166667349636555, |
|
"rewards/xmlcount_reward_func": 0.4791666716337204, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 71.20833683013916, |
|
"epoch": 0.04282082162451492, |
|
"grad_norm": 1.5928924083709717, |
|
"kl": 0.31612617522478104, |
|
"learning_rate": 4.322421568553529e-06, |
|
"loss": 0.0126, |
|
"reward": 2.0416666865348816, |
|
"reward_std": 0.20412416756153107, |
|
"rewards/correctness_reward_func": 0.5833333358168602, |
|
"rewards/int_reward_func": 0.4791666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 112.33333492279053, |
|
"epoch": 0.043356081894821354, |
|
"grad_norm": 0.4769633412361145, |
|
"kl": 0.21202785894274712, |
|
"learning_rate": 4.2983495008466285e-06, |
|
"loss": 0.0085, |
|
"reward": 1.7968750447034836, |
|
"reward_std": 0.15385404229164124, |
|
"rewards/correctness_reward_func": 0.5, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375000074505806, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 175.41667366027832, |
|
"epoch": 0.04389134216512779, |
|
"grad_norm": 0.9674685597419739, |
|
"kl": 0.13749209698289633, |
|
"learning_rate": 4.273926841341303e-06, |
|
"loss": 0.0055, |
|
"reward": 1.6510417461395264, |
|
"reward_std": 0.5754482969641685, |
|
"rewards/correctness_reward_func": 0.416666679084301, |
|
"rewards/int_reward_func": 0.4166666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.33333334140479565, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 67.87500190734863, |
|
"epoch": 0.04442660243543423, |
|
"grad_norm": 1.0911532640457153, |
|
"kl": 0.28480928763747215, |
|
"learning_rate": 4.249158351283414e-06, |
|
"loss": 0.0114, |
|
"reward": 1.4166666865348816, |
|
"reward_std": 0.12909945845603943, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.4791666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375000074505806, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 71.91666889190674, |
|
"epoch": 0.044961862705740666, |
|
"grad_norm": 1.9787883758544922, |
|
"kl": 0.2760180849581957, |
|
"learning_rate": 4.224048859339175e-06, |
|
"loss": 0.011, |
|
"reward": 1.1250000596046448, |
|
"reward_std": 0.20412414520978928, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.25000000558793545, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.37500000558793545, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 124.37500190734863, |
|
"epoch": 0.045497122976047105, |
|
"grad_norm": 0.5233182907104492, |
|
"kl": 0.20504293218255043, |
|
"learning_rate": 4.198603260653792e-06, |
|
"loss": 0.0082, |
|
"reward": 1.2343750298023224, |
|
"reward_std": 0.3668263405561447, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.3541666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.25, |
|
"rewards/xmlcount_reward_func": 0.4635416716337204, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 101.62500381469727, |
|
"epoch": 0.04603238324635354, |
|
"grad_norm": 1.2110995054244995, |
|
"kl": 0.15592870488762856, |
|
"learning_rate": 4.172826515897146e-06, |
|
"loss": 0.0062, |
|
"reward": 1.7083334028720856, |
|
"reward_std": 0.5623037368059158, |
|
"rewards/correctness_reward_func": 0.5000000223517418, |
|
"rewards/int_reward_func": 0.2291666753590107, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 126.25000762939453, |
|
"epoch": 0.04656764351665998, |
|
"grad_norm": 0.7313621640205383, |
|
"kl": 0.18805699050426483, |
|
"learning_rate": 4.146723650296701e-06, |
|
"loss": 0.0075, |
|
"reward": 1.3646250218153, |
|
"reward_std": 0.3509002774953842, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.14583333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.416666679084301, |
|
"rewards/xmlcount_reward_func": 0.4687916710972786, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 209.45833683013916, |
|
"epoch": 0.04710290378696641, |
|
"grad_norm": 0.725437343120575, |
|
"kl": 0.16482173651456833, |
|
"learning_rate": 4.120299752657828e-06, |
|
"loss": 0.0066, |
|
"reward": 1.046916663646698, |
|
"reward_std": 0.4301242418587208, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.1875000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.2291666716337204, |
|
"rewards/xmlcount_reward_func": 0.4635833352804184, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 165.0833396911621, |
|
"epoch": 0.04763816405727285, |
|
"grad_norm": 0.7724325656890869, |
|
"kl": 0.20376655086874962, |
|
"learning_rate": 4.093559974371725e-06, |
|
"loss": 0.0082, |
|
"reward": 1.0260416865348816, |
|
"reward_std": 0.1994822435081005, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.1250000037252903, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 98.66666984558105, |
|
"epoch": 0.04817342432757928, |
|
"grad_norm": 1.395373821258545, |
|
"kl": 0.2529403530061245, |
|
"learning_rate": 4.066509528411151e-06, |
|
"loss": 0.0101, |
|
"reward": 1.0780000239610672, |
|
"reward_std": 0.2602536380290985, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.1875000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333432674408, |
|
"rewards/xmlcount_reward_func": 0.4946666657924652, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 89.25, |
|
"epoch": 0.04870868459788572, |
|
"grad_norm": 0.8370155692100525, |
|
"kl": 0.19209491834044456, |
|
"learning_rate": 4.039153688314146e-06, |
|
"loss": 0.0077, |
|
"reward": 1.2864583879709244, |
|
"reward_std": 0.43756843730807304, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.0833333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333432674408, |
|
"rewards/xmlcount_reward_func": 0.4947916716337204, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 63.333335876464844, |
|
"epoch": 0.04924394486819216, |
|
"grad_norm": 2.1854286193847656, |
|
"kl": 0.21549397706985474, |
|
"learning_rate": 4.011497787155938e-06, |
|
"loss": 0.0086, |
|
"reward": 1.2964583337306976, |
|
"reward_std": 0.1316254585981369, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3541666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333358168602, |
|
"rewards/xmlcount_reward_func": 0.48395833373069763, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 88.62500190734863, |
|
"epoch": 0.049779205138498595, |
|
"grad_norm": 1.6499943733215332, |
|
"kl": 0.2516642101109028, |
|
"learning_rate": 3.983547216509254e-06, |
|
"loss": 0.0101, |
|
"reward": 1.6613333523273468, |
|
"reward_std": 0.6053861007094383, |
|
"rewards/correctness_reward_func": 0.4166666716337204, |
|
"rewards/int_reward_func": 0.3541666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333358168602, |
|
"rewards/xmlcount_reward_func": 0.4946666657924652, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 134.12500381469727, |
|
"epoch": 0.050314465408805034, |
|
"grad_norm": 0.9040616154670715, |
|
"kl": 0.18027858808636665, |
|
"learning_rate": 3.955307425393224e-06, |
|
"loss": 0.0072, |
|
"reward": 1.1510417014360428, |
|
"reward_std": 0.4075661599636078, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3750000037252903, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 143.12500381469727, |
|
"epoch": 0.05084972567911147, |
|
"grad_norm": 0.8769562244415283, |
|
"kl": 0.2623859569430351, |
|
"learning_rate": 3.92678391921108e-06, |
|
"loss": 0.0105, |
|
"reward": 1.3593750298023224, |
|
"reward_std": 0.3879491835832596, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.3958333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333358168602, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 80.58333587646484, |
|
"epoch": 0.051384985949417906, |
|
"grad_norm": 1.6260957717895508, |
|
"kl": 0.2531866990029812, |
|
"learning_rate": 3.897982258676867e-06, |
|
"loss": 0.0101, |
|
"reward": 1.3281250298023224, |
|
"reward_std": 0.37103308364748955, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.33333334140479565, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375000074505806, |
|
"rewards/xmlcount_reward_func": 0.4739583432674408, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 104.91666889190674, |
|
"epoch": 0.05192024621972434, |
|
"grad_norm": 0.8191968202590942, |
|
"kl": 0.27327974885702133, |
|
"learning_rate": 3.868908058731376e-06, |
|
"loss": 0.0109, |
|
"reward": 1.1041666865348816, |
|
"reward_std": 0.16337091475725174, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.2500000037252903, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.375, |
|
"rewards/xmlcount_reward_func": 0.4791666716337204, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 62.00000190734863, |
|
"epoch": 0.05245550649003078, |
|
"grad_norm": 0.3426864743232727, |
|
"kl": 0.2767893858253956, |
|
"learning_rate": 3.839566987447492e-06, |
|
"loss": 0.0111, |
|
"reward": 1.3541666865348816, |
|
"reward_std": 0.05103103816509247, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 389.2500114440918, |
|
"epoch": 0.05299076676033721, |
|
"grad_norm": 1.0328341722488403, |
|
"kl": 0.1228889636695385, |
|
"learning_rate": 3.8099647649251984e-06, |
|
"loss": 0.0049, |
|
"reward": 1.067708358168602, |
|
"reward_std": 0.422527939081192, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.27083333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.2916666716337204, |
|
"rewards/xmlcount_reward_func": 0.421875, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 73.62500286102295, |
|
"epoch": 0.05352602703064365, |
|
"grad_norm": 0.7198861241340637, |
|
"kl": 0.2295570969581604, |
|
"learning_rate": 3.780107162176429e-06, |
|
"loss": 0.0092, |
|
"reward": 1.375, |
|
"reward_std": 0.2885505259037018, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.2916666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 247.083345413208, |
|
"epoch": 0.054061287300950084, |
|
"grad_norm": 0.8113954663276672, |
|
"kl": 0.2137942397966981, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.0086, |
|
"reward": 1.0833750367164612, |
|
"reward_std": 0.4043814614415169, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.29166667349636555, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.2916666716337204, |
|
"rewards/xmlcount_reward_func": 0.4167083315551281, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 161.08334159851074, |
|
"epoch": 0.05459654757125652, |
|
"grad_norm": 0.9631187915802002, |
|
"kl": 0.320892296731472, |
|
"learning_rate": 3.7196491478468322e-06, |
|
"loss": 0.0128, |
|
"reward": 1.7552083432674408, |
|
"reward_std": 0.20395417511463165, |
|
"rewards/correctness_reward_func": 0.5, |
|
"rewards/int_reward_func": 0.3541666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 86.87500286102295, |
|
"epoch": 0.05513180784156296, |
|
"grad_norm": 0.5522407293319702, |
|
"kl": 0.19866621680557728, |
|
"learning_rate": 3.689060522675689e-06, |
|
"loss": 0.0079, |
|
"reward": 1.6974583566188812, |
|
"reward_std": 0.43488648533821106, |
|
"rewards/correctness_reward_func": 0.4166666865348816, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333432674408, |
|
"rewards/xmlcount_reward_func": 0.48912499845027924, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 116.37500381469727, |
|
"epoch": 0.055667068111869396, |
|
"grad_norm": 0.39440569281578064, |
|
"kl": 0.2107317578047514, |
|
"learning_rate": 3.658240087799655e-06, |
|
"loss": 0.0084, |
|
"reward": 1.4635416865348816, |
|
"reward_std": 0.3243303596973419, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.4375000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333432674408, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 115.41666984558105, |
|
"epoch": 0.056202328382175835, |
|
"grad_norm": 1.1680481433868408, |
|
"kl": 0.4403987228870392, |
|
"learning_rate": 3.627193851723577e-06, |
|
"loss": 0.0176, |
|
"reward": 1.2552083730697632, |
|
"reward_std": 0.2559161148965359, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3125000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333432674408, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 69.83333396911621, |
|
"epoch": 0.05673758865248227, |
|
"grad_norm": 0.4940861463546753, |
|
"kl": 0.27071962505578995, |
|
"learning_rate": 3.595927866972694e-06, |
|
"loss": 0.0108, |
|
"reward": 1.1666666865348816, |
|
"reward_std": 0.06454972922801971, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.1666666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 94.45833587646484, |
|
"epoch": 0.05727284892278871, |
|
"grad_norm": 1.4608356952667236, |
|
"kl": 0.16904586926102638, |
|
"learning_rate": 3.564448228912682e-06, |
|
"loss": 0.0068, |
|
"reward": 1.4895833432674408, |
|
"reward_std": 0.4765794351696968, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.4166666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.416666679084301, |
|
"rewards/xmlcount_reward_func": 0.4895833432674408, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 130.75000190734863, |
|
"epoch": 0.05780810919309514, |
|
"grad_norm": 1.187828779220581, |
|
"kl": 0.2347713652998209, |
|
"learning_rate": 3.532761074561355e-06, |
|
"loss": 0.0094, |
|
"reward": 1.0625000298023224, |
|
"reward_std": 0.11558076366782188, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.2500000037252903, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3125, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 81.08333587646484, |
|
"epoch": 0.05834336946340158, |
|
"grad_norm": 0.4293256103992462, |
|
"kl": 0.3195993173867464, |
|
"learning_rate": 3.5008725813922383e-06, |
|
"loss": 0.0128, |
|
"reward": 1.1666666865348816, |
|
"reward_std": 0.06454972922801971, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.1666666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 153.29166984558105, |
|
"epoch": 0.05887862973370801, |
|
"grad_norm": 1.0056191682815552, |
|
"kl": 0.20048995688557625, |
|
"learning_rate": 3.4687889661302577e-06, |
|
"loss": 0.008, |
|
"reward": 1.4843750298023224, |
|
"reward_std": 0.41315262764692307, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.37500000558793545, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.37500000558793545, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 88.54167175292969, |
|
"epoch": 0.05941389000401445, |
|
"grad_norm": 1.9036917686462402, |
|
"kl": 0.2402002513408661, |
|
"learning_rate": 3.436516483539781e-06, |
|
"loss": 0.0096, |
|
"reward": 1.833333358168602, |
|
"reward_std": 0.5183059275150299, |
|
"rewards/correctness_reward_func": 0.5000000223517418, |
|
"rewards/int_reward_func": 0.39583333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375000074505806, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 73.37500190734863, |
|
"epoch": 0.059949150274320885, |
|
"grad_norm": 1.473577857017517, |
|
"kl": 0.31748900189995766, |
|
"learning_rate": 3.4040614252052305e-06, |
|
"loss": 0.0127, |
|
"reward": 1.3645833730697632, |
|
"reward_std": 0.3358423411846161, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.3541666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 0.4895833358168602, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 75.04166793823242, |
|
"epoch": 0.060484410544627325, |
|
"grad_norm": 0.9336056709289551, |
|
"kl": 0.29590417072176933, |
|
"learning_rate": 3.3714301183045382e-06, |
|
"loss": 0.0118, |
|
"reward": 1.270833358168602, |
|
"reward_std": 0.1530931033194065, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333432674408, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 71.25000190734863, |
|
"epoch": 0.061019670814933764, |
|
"grad_norm": 1.1127337217330933, |
|
"kl": 0.2616447024047375, |
|
"learning_rate": 3.338628924375638e-06, |
|
"loss": 0.0105, |
|
"reward": 1.291666716337204, |
|
"reward_std": 0.16661179810762405, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3125000149011612, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 75.50000286102295, |
|
"epoch": 0.0615549310852402, |
|
"grad_norm": 0.7082913517951965, |
|
"kl": 0.2341964803636074, |
|
"learning_rate": 3.3056642380762783e-06, |
|
"loss": 0.0094, |
|
"reward": 1.4791666865348816, |
|
"reward_std": 0.3881191611289978, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.3333333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 206.91667556762695, |
|
"epoch": 0.06209019135554664, |
|
"grad_norm": 0.41269803047180176, |
|
"kl": 0.18453531339764595, |
|
"learning_rate": 3.272542485937369e-06, |
|
"loss": 0.0074, |
|
"reward": 1.5364583730697632, |
|
"reward_std": 0.7113124281167984, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.4375000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3125000037252903, |
|
"rewards/xmlcount_reward_func": 0.453125, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 88.50000381469727, |
|
"epoch": 0.06262545162585308, |
|
"grad_norm": 0.8284731507301331, |
|
"kl": 0.18175287544727325, |
|
"learning_rate": 3.2392701251101172e-06, |
|
"loss": 0.0073, |
|
"reward": 1.3958333730697632, |
|
"reward_std": 0.3092299550771713, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3541666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 151.66666984558105, |
|
"epoch": 0.0631607118961595, |
|
"grad_norm": 1.4958237409591675, |
|
"kl": 0.2391066513955593, |
|
"learning_rate": 3.205853642107192e-06, |
|
"loss": 0.0096, |
|
"reward": 1.1510416865348816, |
|
"reward_std": 0.5006890743970871, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.22916666977107525, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.2708333432674408, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 123.37500381469727, |
|
"epoch": 0.06369597216646594, |
|
"grad_norm": 0.7593151926994324, |
|
"kl": 0.23146136105060577, |
|
"learning_rate": 3.1722995515381644e-06, |
|
"loss": 0.0093, |
|
"reward": 1.4270833432674408, |
|
"reward_std": 0.48038505017757416, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.3750000037252903, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.46875, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 123.75000762939453, |
|
"epoch": 0.06423123243677238, |
|
"grad_norm": 0.3002820909023285, |
|
"kl": 0.18683998472988605, |
|
"learning_rate": 3.1386143948394764e-06, |
|
"loss": 0.0075, |
|
"reward": 1.3802083730697632, |
|
"reward_std": 0.2934284619987011, |
|
"rewards/correctness_reward_func": 0.4166666865348816, |
|
"rewards/int_reward_func": 0.1041666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.375, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 71.62500190734863, |
|
"epoch": 0.06476649270707882, |
|
"grad_norm": 0.894112229347229, |
|
"kl": 0.2145114541053772, |
|
"learning_rate": 3.1048047389991693e-06, |
|
"loss": 0.0086, |
|
"reward": 1.2291666865348816, |
|
"reward_std": 0.05103103443980217, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 77.50000095367432, |
|
"epoch": 0.06530175297738525, |
|
"grad_norm": 0.9729853272438049, |
|
"kl": 0.25685518980026245, |
|
"learning_rate": 3.0708771752766397e-06, |
|
"loss": 0.0103, |
|
"reward": 1.645833358168602, |
|
"reward_std": 0.25515517219901085, |
|
"rewards/correctness_reward_func": 0.4166666865348816, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 115.25000381469727, |
|
"epoch": 0.06583701324769169, |
|
"grad_norm": 0.60560142993927, |
|
"kl": 0.26780444383621216, |
|
"learning_rate": 3.0368383179176584e-06, |
|
"loss": 0.0107, |
|
"reward": 1.3333333432674408, |
|
"reward_std": 0.06454972922801971, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333358168602, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 65.25000190734863, |
|
"epoch": 0.06637227351799813, |
|
"grad_norm": 0.47688770294189453, |
|
"kl": 0.2243332415819168, |
|
"learning_rate": 3.002694802864912e-06, |
|
"loss": 0.009, |
|
"reward": 1.2291666865348816, |
|
"reward_std": 0.05103103816509247, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 349.833345413208, |
|
"epoch": 0.06690753378830457, |
|
"grad_norm": 0.6750498414039612, |
|
"kl": 0.27263053273782134, |
|
"learning_rate": 2.9684532864643123e-06, |
|
"loss": 0.0109, |
|
"reward": 1.5260833650827408, |
|
"reward_std": 0.4318152070045471, |
|
"rewards/correctness_reward_func": 0.4166666865348816, |
|
"rewards/int_reward_func": 0.3333333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3541666716337204, |
|
"rewards/xmlcount_reward_func": 0.421916663646698, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 186.62500762939453, |
|
"epoch": 0.067442794058611, |
|
"grad_norm": 0.8187605142593384, |
|
"kl": 0.2676307410001755, |
|
"learning_rate": 2.9341204441673267e-06, |
|
"loss": 0.0107, |
|
"reward": 1.55266672372818, |
|
"reward_std": 0.4699760675430298, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.4166666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375000074505806, |
|
"rewards/xmlcount_reward_func": 0.44849999994039536, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 72.50000190734863, |
|
"epoch": 0.06797805432891743, |
|
"grad_norm": 0.06586437672376633, |
|
"kl": 0.25656602531671524, |
|
"learning_rate": 2.8997029692295875e-06, |
|
"loss": 0.0103, |
|
"reward": 1.375, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 67.66666889190674, |
|
"epoch": 0.06851331459922387, |
|
"grad_norm": 0.13435645401477814, |
|
"kl": 0.23636912554502487, |
|
"learning_rate": 2.8652075714060296e-06, |
|
"loss": 0.0095, |
|
"reward": 1.25, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 71.12500381469727, |
|
"epoch": 0.06904857486953031, |
|
"grad_norm": 0.6717380285263062, |
|
"kl": 0.3070458807051182, |
|
"learning_rate": 2.8306409756428067e-06, |
|
"loss": 0.0123, |
|
"reward": 1.4375000298023224, |
|
"reward_std": 0.11558076366782188, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.4583333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 73.16666889190674, |
|
"epoch": 0.06958383513983675, |
|
"grad_norm": 0.9400418996810913, |
|
"kl": 0.22653049230575562, |
|
"learning_rate": 2.7960099207662535e-06, |
|
"loss": 0.0091, |
|
"reward": 2.0000000596046448, |
|
"reward_std": 0.556085180491209, |
|
"rewards/correctness_reward_func": 0.6666666865348816, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333432674408, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 71.66666889190674, |
|
"epoch": 0.07011909541014318, |
|
"grad_norm": 0.6696528792381287, |
|
"kl": 0.26699281856417656, |
|
"learning_rate": 2.761321158169134e-06, |
|
"loss": 0.0107, |
|
"reward": 1.3958333730697632, |
|
"reward_std": 0.25515518337488174, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.4583333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375000149011612, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 98.91666984558105, |
|
"epoch": 0.07065435568044962, |
|
"grad_norm": 0.8539110422134399, |
|
"kl": 0.23305394127964973, |
|
"learning_rate": 2.726581450494451e-06, |
|
"loss": 0.0093, |
|
"reward": 1.1562500298023224, |
|
"reward_std": 0.22562336921691895, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.2916666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3750000037252903, |
|
"rewards/xmlcount_reward_func": 0.4895833358168602, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 76.66666984558105, |
|
"epoch": 0.07118961595075605, |
|
"grad_norm": 0.2167089283466339, |
|
"kl": 0.20909808576107025, |
|
"learning_rate": 2.6917975703170466e-06, |
|
"loss": 0.0084, |
|
"reward": 1.25, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 69.50000190734863, |
|
"epoch": 0.0717248762210625, |
|
"grad_norm": 1.07498037815094, |
|
"kl": 0.2273651361465454, |
|
"learning_rate": 2.6569762988232838e-06, |
|
"loss": 0.0091, |
|
"reward": 1.2291666865348816, |
|
"reward_std": 0.11558076366782188, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.22916666977107525, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 78.87500190734863, |
|
"epoch": 0.07226013649136893, |
|
"grad_norm": 1.082901954650879, |
|
"kl": 0.21143031865358353, |
|
"learning_rate": 2.6221244244890336e-06, |
|
"loss": 0.0085, |
|
"reward": 1.2691666781902313, |
|
"reward_std": 0.16395629942417145, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.4775000065565109, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 84.79166793823242, |
|
"epoch": 0.07279539676167536, |
|
"grad_norm": 0.5148155093193054, |
|
"kl": 0.19925828650593758, |
|
"learning_rate": 2.587248741756253e-06, |
|
"loss": 0.008, |
|
"reward": 1.375, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 102.62500190734863, |
|
"epoch": 0.0733306570319818, |
|
"grad_norm": 0.6643544435501099, |
|
"kl": 0.1562279723584652, |
|
"learning_rate": 2.5523560497083927e-06, |
|
"loss": 0.0062, |
|
"reward": 1.248750001192093, |
|
"reward_std": 0.2717357352375984, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375000149011612, |
|
"rewards/xmlcount_reward_func": 0.4779166728258133, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 67.0416669845581, |
|
"epoch": 0.07386591730228824, |
|
"grad_norm": 0.4721544086933136, |
|
"kl": 0.19591450318694115, |
|
"learning_rate": 2.517453150744904e-06, |
|
"loss": 0.0078, |
|
"reward": 1.3541666865348816, |
|
"reward_std": 0.05103103816509247, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 79.37500190734863, |
|
"epoch": 0.07440117757259468, |
|
"grad_norm": 0.4948488771915436, |
|
"kl": 0.20949136465787888, |
|
"learning_rate": 2.482546849255096e-06, |
|
"loss": 0.0084, |
|
"reward": 1.5416666865348816, |
|
"reward_std": 0.25819891691207886, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 64.29166889190674, |
|
"epoch": 0.0749364378429011, |
|
"grad_norm": 0.8793404698371887, |
|
"kl": 0.22966529056429863, |
|
"learning_rate": 2.447643950291608e-06, |
|
"loss": 0.0092, |
|
"reward": 1.3541666865348816, |
|
"reward_std": 0.25515519082546234, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.27083333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 139.62500190734863, |
|
"epoch": 0.07547169811320754, |
|
"grad_norm": 0.5459631681442261, |
|
"kl": 0.19374394416809082, |
|
"learning_rate": 2.4127512582437486e-06, |
|
"loss": 0.0077, |
|
"reward": 1.177083358168602, |
|
"reward_std": 0.18890930339694023, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.2916666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666679084301, |
|
"rewards/xmlcount_reward_func": 0.46875, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 153.16666984558105, |
|
"epoch": 0.07600695838351398, |
|
"grad_norm": 0.9452311396598816, |
|
"kl": 0.1499454267323017, |
|
"learning_rate": 2.377875575510967e-06, |
|
"loss": 0.006, |
|
"reward": 1.7239583730697632, |
|
"reward_std": 0.8880488127470016, |
|
"rewards/correctness_reward_func": 0.666666679084301, |
|
"rewards/int_reward_func": 0.3958333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.25, |
|
"rewards/xmlcount_reward_func": 0.4114583358168602, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 61.29166793823242, |
|
"epoch": 0.07654221865382042, |
|
"grad_norm": 0.09438279271125793, |
|
"kl": 0.3462410867214203, |
|
"learning_rate": 2.3430237011767166e-06, |
|
"loss": 0.0138, |
|
"reward": 1.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 88.37500286102295, |
|
"epoch": 0.07707747892412686, |
|
"grad_norm": 1.7993136644363403, |
|
"kl": 0.32219041138887405, |
|
"learning_rate": 2.3082024296829538e-06, |
|
"loss": 0.0129, |
|
"reward": 1.4218750596046448, |
|
"reward_std": 0.4565740302205086, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.37500000558793545, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333358168602, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 72.62500381469727, |
|
"epoch": 0.07761273919443329, |
|
"grad_norm": 0.09042877703905106, |
|
"kl": 0.20896168053150177, |
|
"learning_rate": 2.2734185495055503e-06, |
|
"loss": 0.0084, |
|
"reward": 1.125, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 163.75000190734863, |
|
"epoch": 0.07814799946473973, |
|
"grad_norm": 1.1228388547897339, |
|
"kl": 0.21238887682557106, |
|
"learning_rate": 2.238678841830867e-06, |
|
"loss": 0.0085, |
|
"reward": 1.1927083730697632, |
|
"reward_std": 0.2637527585029602, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3333333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3750000074505806, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 71.95833683013916, |
|
"epoch": 0.07868325973504617, |
|
"grad_norm": 0.661372721195221, |
|
"kl": 0.3126152493059635, |
|
"learning_rate": 2.2039900792337477e-06, |
|
"loss": 0.0125, |
|
"reward": 1.4375, |
|
"reward_std": 0.22008520364761353, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.39583333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333358168602, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 72.33333492279053, |
|
"epoch": 0.07921852000535261, |
|
"grad_norm": 0.6126328706741333, |
|
"kl": 0.18934645876288414, |
|
"learning_rate": 2.1693590243571937e-06, |
|
"loss": 0.0076, |
|
"reward": 1.8333333432674408, |
|
"reward_std": 0.20412413775920868, |
|
"rewards/correctness_reward_func": 0.5833333358168602, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 103.37500381469727, |
|
"epoch": 0.07975378027565903, |
|
"grad_norm": 0.8722951412200928, |
|
"kl": 0.14480971172451973, |
|
"learning_rate": 2.134792428593971e-06, |
|
"loss": 0.0058, |
|
"reward": 1.895833432674408, |
|
"reward_std": 0.5449064522981644, |
|
"rewards/correctness_reward_func": 0.5000000149011612, |
|
"rewards/int_reward_func": 0.4791666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 69.54166984558105, |
|
"epoch": 0.08028904054596547, |
|
"grad_norm": 0.7660404443740845, |
|
"kl": 0.20250581949949265, |
|
"learning_rate": 2.1002970307704134e-06, |
|
"loss": 0.0081, |
|
"reward": 1.3958333432674408, |
|
"reward_std": 0.35721728205680847, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 180.7083396911621, |
|
"epoch": 0.08082430081627191, |
|
"grad_norm": 0.5615190267562866, |
|
"kl": 0.23339027352631092, |
|
"learning_rate": 2.0658795558326745e-06, |
|
"loss": 0.0093, |
|
"reward": 1.5729166865348816, |
|
"reward_std": 0.31237732619047165, |
|
"rewards/correctness_reward_func": 0.4166666865348816, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 0.46875, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 63.83333396911621, |
|
"epoch": 0.08135956108657835, |
|
"grad_norm": 0.1392926722764969, |
|
"kl": 0.26523152738809586, |
|
"learning_rate": 2.031546713535688e-06, |
|
"loss": 0.0106, |
|
"reward": 1.375, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 81.08333778381348, |
|
"epoch": 0.08189482135688479, |
|
"grad_norm": 1.1122794151306152, |
|
"kl": 0.25071796402335167, |
|
"learning_rate": 1.997305197135089e-06, |
|
"loss": 0.01, |
|
"reward": 1.5200416892766953, |
|
"reward_std": 0.3365423232316971, |
|
"rewards/correctness_reward_func": 0.5, |
|
"rewards/int_reward_func": 0.1875, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.354166679084301, |
|
"rewards/xmlcount_reward_func": 0.47837500274181366, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 93.37500381469727, |
|
"epoch": 0.08243008162719122, |
|
"grad_norm": 1.3372734785079956, |
|
"kl": 0.19058941677212715, |
|
"learning_rate": 1.963161682082342e-06, |
|
"loss": 0.0076, |
|
"reward": 1.6250000298023224, |
|
"reward_std": 0.47279806435108185, |
|
"rewards/correctness_reward_func": 0.5000000223517418, |
|
"rewards/int_reward_func": 0.22916666977107525, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.39583333395421505, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 104.83333396911621, |
|
"epoch": 0.08296534189749766, |
|
"grad_norm": 2.2292211055755615, |
|
"kl": 0.16792716644704342, |
|
"learning_rate": 1.9291228247233607e-06, |
|
"loss": 0.0067, |
|
"reward": 2.0000000596046448, |
|
"reward_std": 0.4289814233779907, |
|
"rewards/correctness_reward_func": 0.583333358168602, |
|
"rewards/int_reward_func": 0.5, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666679084301, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 90.87500476837158, |
|
"epoch": 0.0835006021678041, |
|
"grad_norm": 0.4122345745563507, |
|
"kl": 0.18274019937962294, |
|
"learning_rate": 1.895195261000831e-06, |
|
"loss": 0.0073, |
|
"reward": 1.2708333432674408, |
|
"reward_std": 0.05103103443980217, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.27083333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 172.45833587646484, |
|
"epoch": 0.08403586243811054, |
|
"grad_norm": 0.8472205400466919, |
|
"kl": 0.2094765491783619, |
|
"learning_rate": 1.8613856051605242e-06, |
|
"loss": 0.0084, |
|
"reward": 1.2343750298023224, |
|
"reward_std": 0.2048850804567337, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3541666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333395421505, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 127.00000762939453, |
|
"epoch": 0.08457112270841696, |
|
"grad_norm": 0.6804733276367188, |
|
"kl": 0.20304612442851067, |
|
"learning_rate": 1.827700448461836e-06, |
|
"loss": 0.0081, |
|
"reward": 1.192708358168602, |
|
"reward_std": 0.17030073329806328, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.25000000558793545, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333358168602, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 85.29166793823242, |
|
"epoch": 0.0851063829787234, |
|
"grad_norm": 0.7838725447654724, |
|
"kl": 0.29594049230217934, |
|
"learning_rate": 1.7941463578928088e-06, |
|
"loss": 0.0118, |
|
"reward": 1.2708333432674408, |
|
"reward_std": 0.16913224011659622, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3541666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375000074505806, |
|
"rewards/xmlcount_reward_func": 0.4791666716337204, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 126.00000381469727, |
|
"epoch": 0.08564164324902984, |
|
"grad_norm": 1.384969711303711, |
|
"kl": 0.16587615385651588, |
|
"learning_rate": 1.7607298748898844e-06, |
|
"loss": 0.0066, |
|
"reward": 1.4843750149011612, |
|
"reward_std": 0.5336930006742477, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.2916666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333432674408, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 72.12500286102295, |
|
"epoch": 0.08617690351933628, |
|
"grad_norm": 0.0433061420917511, |
|
"kl": 0.1850288063287735, |
|
"learning_rate": 1.7274575140626318e-06, |
|
"loss": 0.0074, |
|
"reward": 1.125, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 174.58334159851074, |
|
"epoch": 0.08671216378964271, |
|
"grad_norm": 0.8271235823631287, |
|
"kl": 0.18862449377775192, |
|
"learning_rate": 1.6943357619237227e-06, |
|
"loss": 0.0075, |
|
"reward": 1.2083333432674408, |
|
"reward_std": 0.30994437262415886, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.3333333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3333333432674408, |
|
"rewards/xmlcount_reward_func": 0.4583333358168602, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 78.20833587646484, |
|
"epoch": 0.08724742405994915, |
|
"grad_norm": 0.908470869064331, |
|
"kl": 0.1706334725022316, |
|
"learning_rate": 1.661371075624363e-06, |
|
"loss": 0.0068, |
|
"reward": 1.2708333432674408, |
|
"reward_std": 0.33958156406879425, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.27083333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 137.91666984558105, |
|
"epoch": 0.08778268433025559, |
|
"grad_norm": 0.7333254814147949, |
|
"kl": 0.1247784998267889, |
|
"learning_rate": 1.6285698816954626e-06, |
|
"loss": 0.005, |
|
"reward": 1.5208333432674408, |
|
"reward_std": 0.767971470952034, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.29166667349636555, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.39583333395421505, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 78.08333683013916, |
|
"epoch": 0.08831794460056203, |
|
"grad_norm": 0.8487056493759155, |
|
"kl": 0.22962494008243084, |
|
"learning_rate": 1.5959385747947697e-06, |
|
"loss": 0.0092, |
|
"reward": 1.2916666865348816, |
|
"reward_std": 0.14360667020082474, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.31250000186264515, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 200.79166793823242, |
|
"epoch": 0.08885320487086847, |
|
"grad_norm": 1.4497365951538086, |
|
"kl": 0.1650528460741043, |
|
"learning_rate": 1.56348351646022e-06, |
|
"loss": 0.0066, |
|
"reward": 1.4895834028720856, |
|
"reward_std": 0.5747665874660015, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.4375000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.46875, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 176.12500762939453, |
|
"epoch": 0.08938846514117489, |
|
"grad_norm": 0.4245186746120453, |
|
"kl": 0.15275901928544044, |
|
"learning_rate": 1.5312110338697427e-06, |
|
"loss": 0.0061, |
|
"reward": 1.1822916865348816, |
|
"reward_std": 0.3088150769472122, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3125000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.453125, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 312.9583396911621, |
|
"epoch": 0.08992372541148133, |
|
"grad_norm": 0.7641220092773438, |
|
"kl": 0.14276206120848656, |
|
"learning_rate": 1.4991274186077632e-06, |
|
"loss": 0.0057, |
|
"reward": 1.166666716337204, |
|
"reward_std": 0.49721667170524597, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.22916666977107525, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666679084301, |
|
"rewards/xmlcount_reward_func": 0.4375, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 101.45833778381348, |
|
"epoch": 0.09045898568178777, |
|
"grad_norm": 0.7765936255455017, |
|
"kl": 0.1949683390557766, |
|
"learning_rate": 1.467238925438646e-06, |
|
"loss": 0.0078, |
|
"reward": 1.2916666865348816, |
|
"reward_std": 0.28610818088054657, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.3125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333395421505, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 172.45833778381348, |
|
"epoch": 0.09099424595209421, |
|
"grad_norm": 0.41969409584999084, |
|
"kl": 0.13137296214699745, |
|
"learning_rate": 1.4355517710873184e-06, |
|
"loss": 0.0053, |
|
"reward": 2.114583343267441, |
|
"reward_std": 0.5827288627624512, |
|
"rewards/correctness_reward_func": 0.8333333358168602, |
|
"rewards/int_reward_func": 0.4583333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3541666716337204, |
|
"rewards/xmlcount_reward_func": 0.46875, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 79.95833587646484, |
|
"epoch": 0.09152950622240064, |
|
"grad_norm": 0.16617827117443085, |
|
"kl": 0.16961714625358582, |
|
"learning_rate": 1.4040721330273063e-06, |
|
"loss": 0.0068, |
|
"reward": 1.25, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 80.87500190734863, |
|
"epoch": 0.09206476649270708, |
|
"grad_norm": 0.36631259322166443, |
|
"kl": 0.1460169106721878, |
|
"learning_rate": 1.3728061482764238e-06, |
|
"loss": 0.0058, |
|
"reward": 1.3333333432674408, |
|
"reward_std": 0.06454972922801971, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3333333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 97.12500190734863, |
|
"epoch": 0.09260002676301352, |
|
"grad_norm": 1.1213421821594238, |
|
"kl": 0.14688214287161827, |
|
"learning_rate": 1.3417599122003464e-06, |
|
"loss": 0.0059, |
|
"reward": 1.2291666865348816, |
|
"reward_std": 0.05103103816509247, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.2291666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 85.58333587646484, |
|
"epoch": 0.09313528703331996, |
|
"grad_norm": 0.7063998579978943, |
|
"kl": 0.17591003328561783, |
|
"learning_rate": 1.3109394773243117e-06, |
|
"loss": 0.007, |
|
"reward": 1.3958333730697632, |
|
"reward_std": 0.11558076739311218, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3958333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 89.04166793823242, |
|
"epoch": 0.0936705473036264, |
|
"grad_norm": 0.7872856855392456, |
|
"kl": 0.17240377515554428, |
|
"learning_rate": 1.280350852153168e-06, |
|
"loss": 0.0069, |
|
"reward": 1.2500000447034836, |
|
"reward_std": 0.2728445753455162, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.2083333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333432674408, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 252.00000762939453, |
|
"epoch": 0.09420580757393282, |
|
"grad_norm": 1.0295542478561401, |
|
"kl": 0.20453453436493874, |
|
"learning_rate": 1.2500000000000007e-06, |
|
"loss": 0.0082, |
|
"reward": 1.3854167014360428, |
|
"reward_std": 0.5866826139390469, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.3958333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.35416666977107525, |
|
"rewards/xmlcount_reward_func": 0.46875, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 147.75000381469727, |
|
"epoch": 0.09474106784423926, |
|
"grad_norm": 0.834882915019989, |
|
"kl": 0.134497981518507, |
|
"learning_rate": 1.2198928378235717e-06, |
|
"loss": 0.0054, |
|
"reward": 1.4427083730697632, |
|
"reward_std": 0.5198761932551861, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.37500000558793545, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 100.08333778381348, |
|
"epoch": 0.0952763281145457, |
|
"grad_norm": 1.221563458442688, |
|
"kl": 0.16201673820614815, |
|
"learning_rate": 1.1900352350748026e-06, |
|
"loss": 0.0065, |
|
"reward": 1.338541716337204, |
|
"reward_std": 0.2559161148965359, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3958333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333432674408, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 127.83333969116211, |
|
"epoch": 0.09581158838485214, |
|
"grad_norm": 0.886989176273346, |
|
"kl": 0.15441275481134653, |
|
"learning_rate": 1.160433012552508e-06, |
|
"loss": 0.0062, |
|
"reward": 0.9375000149011612, |
|
"reward_std": 0.18744874745607376, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.0625, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3750000037252903, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 86.87500190734863, |
|
"epoch": 0.09634684865515857, |
|
"grad_norm": 1.1908172369003296, |
|
"kl": 0.15353485196828842, |
|
"learning_rate": 1.1310919412686248e-06, |
|
"loss": 0.0061, |
|
"reward": 1.0833333432674408, |
|
"reward_std": 0.10206207260489464, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333432674408, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 89.58333587646484, |
|
"epoch": 0.096882108925465, |
|
"grad_norm": 1.44786536693573, |
|
"kl": 0.1816324070096016, |
|
"learning_rate": 1.1020177413231334e-06, |
|
"loss": 0.0073, |
|
"reward": 1.510416716337204, |
|
"reward_std": 0.472514558583498, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.3750000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333358168602, |
|
"rewards/xmlcount_reward_func": 0.4895833358168602, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 82.58333778381348, |
|
"epoch": 0.09741736919577144, |
|
"grad_norm": 0.13121654093265533, |
|
"kl": 0.18894518539309502, |
|
"learning_rate": 1.073216080788921e-06, |
|
"loss": 0.0076, |
|
"reward": 1.25, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 221.0000123977661, |
|
"epoch": 0.09795262946607788, |
|
"grad_norm": 1.4435704946517944, |
|
"kl": 0.18115888815373182, |
|
"learning_rate": 1.0446925746067768e-06, |
|
"loss": 0.0072, |
|
"reward": 1.1875, |
|
"reward_std": 0.28912585973739624, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.2916666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.375, |
|
"rewards/xmlcount_reward_func": 0.4375, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 125.9583387374878, |
|
"epoch": 0.09848788973638432, |
|
"grad_norm": 1.2450430393218994, |
|
"kl": 0.14993033185601234, |
|
"learning_rate": 1.0164527834907468e-06, |
|
"loss": 0.006, |
|
"reward": 1.6093750596046448, |
|
"reward_std": 0.7003048211336136, |
|
"rewards/correctness_reward_func": 0.583333358168602, |
|
"rewards/int_reward_func": 0.1875000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3541666716337204, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 250.50000381469727, |
|
"epoch": 0.09902315000669075, |
|
"grad_norm": 0.6676502823829651, |
|
"kl": 0.14374011009931564, |
|
"learning_rate": 9.88502212844063e-07, |
|
"loss": 0.0058, |
|
"reward": 1.6502083837985992, |
|
"reward_std": 0.4829741967841983, |
|
"rewards/correctness_reward_func": 0.6666666716337204, |
|
"rewards/int_reward_func": 0.2916666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.2708333395421505, |
|
"rewards/xmlcount_reward_func": 0.42104167491197586, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 95.41666984558105, |
|
"epoch": 0.09955841027699719, |
|
"grad_norm": 0.14050611853599548, |
|
"kl": 0.19362322241067886, |
|
"learning_rate": 9.608463116858544e-07, |
|
"loss": 0.0077, |
|
"reward": 1.5, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.5, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.375, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 69.25000190734863, |
|
"epoch": 0.10009367054730363, |
|
"grad_norm": 0.7105045318603516, |
|
"kl": 0.22476506605744362, |
|
"learning_rate": 9.334904715888496e-07, |
|
"loss": 0.009, |
|
"reward": 1.2291666865348816, |
|
"reward_std": 0.05103103443980217, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.2291666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 81.08333778381348, |
|
"epoch": 0.10062893081761007, |
|
"grad_norm": 1.2536767721176147, |
|
"kl": 0.20191873610019684, |
|
"learning_rate": 9.064400256282757e-07, |
|
"loss": 0.0081, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.20479072630405426, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.10416666977107525, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.4791666716337204, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 79.95833587646484, |
|
"epoch": 0.1011641910879165, |
|
"grad_norm": 0.1358516663312912, |
|
"kl": 0.2150093950331211, |
|
"learning_rate": 8.797002473421729e-07, |
|
"loss": 0.0086, |
|
"reward": 1.375, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 255.83334350585938, |
|
"epoch": 0.10169945135822293, |
|
"grad_norm": 0.8230968713760376, |
|
"kl": 0.1948665827512741, |
|
"learning_rate": 8.532763497032987e-07, |
|
"loss": 0.0078, |
|
"reward": 1.1041666865348816, |
|
"reward_std": 0.3931647092103958, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.2500000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.4375, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 72.45833492279053, |
|
"epoch": 0.10223471162852937, |
|
"grad_norm": 0.6617278456687927, |
|
"kl": 0.17371252551674843, |
|
"learning_rate": 8.271734841028553e-07, |
|
"loss": 0.0069, |
|
"reward": 1.4166666865348816, |
|
"reward_std": 0.4518480896949768, |
|
"rewards/correctness_reward_func": 0.2500000074505806, |
|
"rewards/int_reward_func": 0.1875, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 92.29166889190674, |
|
"epoch": 0.10276997189883581, |
|
"grad_norm": 0.7337960600852966, |
|
"kl": 0.18250929936766624, |
|
"learning_rate": 8.013967393462094e-07, |
|
"loss": 0.0073, |
|
"reward": 1.4375000596046448, |
|
"reward_std": 0.4927079305052757, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.2708333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 93.79166984558105, |
|
"epoch": 0.10330523216914224, |
|
"grad_norm": 1.3416187763214111, |
|
"kl": 0.19713782332837582, |
|
"learning_rate": 7.759511406608255e-07, |
|
"loss": 0.0079, |
|
"reward": 1.5156250596046448, |
|
"reward_std": 0.5118480771780014, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.37500000558793545, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333432674408, |
|
"rewards/xmlcount_reward_func": 0.4947916716337204, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 96.08333969116211, |
|
"epoch": 0.10384049243944868, |
|
"grad_norm": 0.9250803589820862, |
|
"kl": 0.21085454896092415, |
|
"learning_rate": 7.508416487165862e-07, |
|
"loss": 0.0084, |
|
"reward": 1.8125000596046448, |
|
"reward_std": 0.31970491632819176, |
|
"rewards/correctness_reward_func": 0.4166666865348816, |
|
"rewards/int_reward_func": 0.4166666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 74.00000190734863, |
|
"epoch": 0.10437575270975512, |
|
"grad_norm": 1.441701889038086, |
|
"kl": 0.18313675373792648, |
|
"learning_rate": 7.260731586586983e-07, |
|
"loss": 0.0073, |
|
"reward": 1.7500000298023224, |
|
"reward_std": 0.3624359965324402, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.4583333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333432674408, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 116.54167175292969, |
|
"epoch": 0.10491101298006156, |
|
"grad_norm": 1.0711389780044556, |
|
"kl": 0.13384228572249413, |
|
"learning_rate": 7.016504991533727e-07, |
|
"loss": 0.0054, |
|
"reward": 1.1666667014360428, |
|
"reward_std": 0.16661180183291435, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.27083333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333395421505, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 78.83333587646484, |
|
"epoch": 0.105446273250368, |
|
"grad_norm": 1.2510522603988647, |
|
"kl": 0.18244327045977116, |
|
"learning_rate": 6.775784314464717e-07, |
|
"loss": 0.0073, |
|
"reward": 1.0781250298023224, |
|
"reward_std": 0.17936956882476807, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.10416666977107525, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.4947916716337204, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 201.83333492279053, |
|
"epoch": 0.10598153352067442, |
|
"grad_norm": 1.4353007078170776, |
|
"kl": 0.17845631763339043, |
|
"learning_rate": 6.538616484352902e-07, |
|
"loss": 0.0071, |
|
"reward": 1.4947916865348816, |
|
"reward_std": 0.45137757435441017, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.33333334140479565, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.375, |
|
"rewards/xmlcount_reward_func": 0.453125, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 315.6666736602783, |
|
"epoch": 0.10651679379098086, |
|
"grad_norm": 0.7709922790527344, |
|
"kl": 0.15007262770086527, |
|
"learning_rate": 6.305047737536707e-07, |
|
"loss": 0.006, |
|
"reward": 1.3333333730697632, |
|
"reward_std": 0.43266693875193596, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.4166666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333395421505, |
|
"rewards/xmlcount_reward_func": 0.4375, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 226.25001335144043, |
|
"epoch": 0.1070520540612873, |
|
"grad_norm": 0.4058845043182373, |
|
"kl": 0.24160834029316902, |
|
"learning_rate": 6.075123608706093e-07, |
|
"loss": 0.0097, |
|
"reward": 1.6354166865348816, |
|
"reward_std": 0.6109069883823395, |
|
"rewards/correctness_reward_func": 0.3333333358168602, |
|
"rewards/int_reward_func": 0.3958333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 0.46875, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 78.66666793823242, |
|
"epoch": 0.10758731433159374, |
|
"grad_norm": 0.5436657071113586, |
|
"kl": 0.16075460240244865, |
|
"learning_rate": 5.848888922025553e-07, |
|
"loss": 0.0064, |
|
"reward": 1.4375, |
|
"reward_std": 0.22008520364761353, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.3541666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 82.58333396911621, |
|
"epoch": 0.10812257460190017, |
|
"grad_norm": 0.7532062530517578, |
|
"kl": 0.1826519127935171, |
|
"learning_rate": 5.626387782395512e-07, |
|
"loss": 0.0073, |
|
"reward": 1.7916666865348816, |
|
"reward_std": 0.20412413775920868, |
|
"rewards/correctness_reward_func": 0.4166666865348816, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 79.16666984558105, |
|
"epoch": 0.10865783487220661, |
|
"grad_norm": 1.078783631324768, |
|
"kl": 0.1677638739347458, |
|
"learning_rate": 5.407663566854008e-07, |
|
"loss": 0.0067, |
|
"reward": 1.4375000298023224, |
|
"reward_std": 0.31970491632819176, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.35416666977107525, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 92.33333396911621, |
|
"epoch": 0.10919309514251305, |
|
"grad_norm": 0.8154336214065552, |
|
"kl": 0.18606754019856453, |
|
"learning_rate": 5.192758916120236e-07, |
|
"loss": 0.0074, |
|
"reward": 1.9375000298023224, |
|
"reward_std": 0.44672293961048126, |
|
"rewards/correctness_reward_func": 0.5000000223517418, |
|
"rewards/int_reward_func": 0.4375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 215.95834159851074, |
|
"epoch": 0.10972835541281949, |
|
"grad_norm": 0.8592817783355713, |
|
"kl": 0.15069226268678904, |
|
"learning_rate": 4.981715726281666e-07, |
|
"loss": 0.006, |
|
"reward": 1.2552083730697632, |
|
"reward_std": 0.21166006475687027, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3958333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.39583333395421505, |
|
"rewards/xmlcount_reward_func": 0.4635416716337204, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 160.62500381469727, |
|
"epoch": 0.11026361568312593, |
|
"grad_norm": 1.0854992866516113, |
|
"kl": 0.1672863345593214, |
|
"learning_rate": 4.774575140626317e-07, |
|
"loss": 0.0067, |
|
"reward": 1.3802083730697632, |
|
"reward_std": 0.4717924892902374, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.4166666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333358168602, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 85.62500190734863, |
|
"epoch": 0.11079887595343235, |
|
"grad_norm": 2.937671184539795, |
|
"kl": 0.22150231339037418, |
|
"learning_rate": 4.5713775416217884e-07, |
|
"loss": 0.0089, |
|
"reward": 1.1458333730697632, |
|
"reward_std": 0.3955717794597149, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.18750000186264515, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3750000074505806, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 86.54166889190674, |
|
"epoch": 0.11133413622373879, |
|
"grad_norm": 0.791983962059021, |
|
"kl": 0.2634577229619026, |
|
"learning_rate": 4.372162543042624e-07, |
|
"loss": 0.0105, |
|
"reward": 1.3541667014360428, |
|
"reward_std": 0.31970490142703056, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.3541666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666679084301, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 103.04166984558105, |
|
"epoch": 0.11186939649404523, |
|
"grad_norm": 0.7941485643386841, |
|
"kl": 0.15498985722661018, |
|
"learning_rate": 4.1769689822475147e-07, |
|
"loss": 0.0062, |
|
"reward": 1.2343750298023224, |
|
"reward_std": 0.3260645717382431, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.4375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.25000000186264515, |
|
"rewards/xmlcount_reward_func": 0.4635416716337204, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 126.79166984558105, |
|
"epoch": 0.11240465676435167, |
|
"grad_norm": 1.0149474143981934, |
|
"kl": 0.12467027455568314, |
|
"learning_rate": 3.9858349126078945e-07, |
|
"loss": 0.005, |
|
"reward": 1.7271667420864105, |
|
"reward_std": 0.5239234380424023, |
|
"rewards/correctness_reward_func": 0.4166666865348816, |
|
"rewards/int_reward_func": 0.4166666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.416666679084301, |
|
"rewards/xmlcount_reward_func": 0.47716666758060455, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 85.79166984558105, |
|
"epoch": 0.1129399170346581, |
|
"grad_norm": 0.8396437764167786, |
|
"kl": 0.18657264113426208, |
|
"learning_rate": 3.798797596089351e-07, |
|
"loss": 0.0075, |
|
"reward": 1.5000000298023224, |
|
"reward_std": 0.32274864614009857, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.3333333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 108.00000381469727, |
|
"epoch": 0.11347517730496454, |
|
"grad_norm": 0.934079110622406, |
|
"kl": 0.1696683205664158, |
|
"learning_rate": 3.615893495987335e-07, |
|
"loss": 0.0068, |
|
"reward": 1.8125000298023224, |
|
"reward_std": 0.6319277845323086, |
|
"rewards/correctness_reward_func": 0.5000000149011612, |
|
"rewards/int_reward_func": 0.3958333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.416666679084301, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 193.0416669845581, |
|
"epoch": 0.11401043757527098, |
|
"grad_norm": 0.6677061319351196, |
|
"kl": 0.1520245149731636, |
|
"learning_rate": 3.4371582698185636e-07, |
|
"loss": 0.0061, |
|
"reward": 1.713541716337204, |
|
"reward_std": 0.5496542304754257, |
|
"rewards/correctness_reward_func": 0.583333358168602, |
|
"rewards/int_reward_func": 0.3333333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3333333395421505, |
|
"rewards/xmlcount_reward_func": 0.4635416716337204, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 86.66666984558105, |
|
"epoch": 0.11454569784557742, |
|
"grad_norm": 0.4593207538127899, |
|
"kl": 0.1518435962498188, |
|
"learning_rate": 3.262626762369525e-07, |
|
"loss": 0.0061, |
|
"reward": 1.0833333432674408, |
|
"reward_std": 0.23273734748363495, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.0625, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 104.45833778381348, |
|
"epoch": 0.11508095811588386, |
|
"grad_norm": 0.42647111415863037, |
|
"kl": 0.21028569713234901, |
|
"learning_rate": 3.092332998903416e-07, |
|
"loss": 0.0084, |
|
"reward": 1.6458333730697632, |
|
"reward_std": 0.2753772810101509, |
|
"rewards/correctness_reward_func": 0.4166666865348816, |
|
"rewards/int_reward_func": 0.4166666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3333333358168602, |
|
"rewards/xmlcount_reward_func": 0.4791666716337204, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 325.2500114440918, |
|
"epoch": 0.11561621838619028, |
|
"grad_norm": 0.5365056991577148, |
|
"kl": 0.134639460593462, |
|
"learning_rate": 2.9263101785268253e-07, |
|
"loss": 0.0054, |
|
"reward": 1.604166679084301, |
|
"reward_std": 0.14088044688105583, |
|
"rewards/correctness_reward_func": 0.5, |
|
"rewards/int_reward_func": 0.2916666679084301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.375, |
|
"rewards/xmlcount_reward_func": 0.4375, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 89.54166793823242, |
|
"epoch": 0.11615147865649672, |
|
"grad_norm": 0.6397629976272583, |
|
"kl": 0.21499066427350044, |
|
"learning_rate": 2.764590667717562e-07, |
|
"loss": 0.0086, |
|
"reward": 1.5, |
|
"reward_std": 0.19364917278289795, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.4583333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333358168602, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 85.50000095367432, |
|
"epoch": 0.11668673892680316, |
|
"grad_norm": 1.3944755792617798, |
|
"kl": 0.13307987339794636, |
|
"learning_rate": 2.6072059940146775e-07, |
|
"loss": 0.0053, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.5103103779256344, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.39583333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 80.66666793823242, |
|
"epoch": 0.1172219991971096, |
|
"grad_norm": 0.08477512001991272, |
|
"kl": 0.16065896674990654, |
|
"learning_rate": 2.454186839872158e-07, |
|
"loss": 0.0064, |
|
"reward": 1.125, |
|
"reward_std": 0.0, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 72.79166793823242, |
|
"epoch": 0.11775725946741603, |
|
"grad_norm": 0.6473060250282288, |
|
"kl": 0.2047443389892578, |
|
"learning_rate": 2.3055630366772857e-07, |
|
"loss": 0.0082, |
|
"reward": 1.6666666865348816, |
|
"reward_std": 0.20412413775920868, |
|
"rewards/correctness_reward_func": 0.4166666865348816, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 255.87500381469727, |
|
"epoch": 0.11829251973772247, |
|
"grad_norm": 1.165757656097412, |
|
"kl": 0.34885890036821365, |
|
"learning_rate": 2.1613635589349756e-07, |
|
"loss": 0.014, |
|
"reward": 1.2083333432674408, |
|
"reward_std": 0.5354874432086945, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.3125, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3750000074505806, |
|
"rewards/xmlcount_reward_func": 0.4375, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 68.04166889190674, |
|
"epoch": 0.1188277800080289, |
|
"grad_norm": 1.5734628438949585, |
|
"kl": 0.35767246037721634, |
|
"learning_rate": 2.0216165186191406e-07, |
|
"loss": 0.0143, |
|
"reward": 1.729166716337204, |
|
"reward_std": 0.4259376786649227, |
|
"rewards/correctness_reward_func": 0.5000000223517418, |
|
"rewards/int_reward_func": 0.25000000558793545, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 93.95833587646484, |
|
"epoch": 0.11936304027833534, |
|
"grad_norm": 0.3049551248550415, |
|
"kl": 0.14186285808682442, |
|
"learning_rate": 1.8863491596921745e-07, |
|
"loss": 0.0057, |
|
"reward": 1.375, |
|
"reward_std": 0.25, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666679084301, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 170.7500057220459, |
|
"epoch": 0.11989830054864177, |
|
"grad_norm": 0.8757752180099487, |
|
"kl": 0.17503754422068596, |
|
"learning_rate": 1.7555878527937164e-07, |
|
"loss": 0.007, |
|
"reward": 1.1865417063236237, |
|
"reward_std": 0.413798563182354, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.2291666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333358168602, |
|
"rewards/xmlcount_reward_func": 0.4782083332538605, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 191.8333396911621, |
|
"epoch": 0.12043356081894821, |
|
"grad_norm": 1.3906254768371582, |
|
"kl": 0.11161109246313572, |
|
"learning_rate": 1.629358090099639e-07, |
|
"loss": 0.0045, |
|
"reward": 0.96875, |
|
"reward_std": 0.29255440831184387, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.1875000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3125000037252903, |
|
"rewards/xmlcount_reward_func": 0.46875, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 150.75000762939453, |
|
"epoch": 0.12096882108925465, |
|
"grad_norm": 0.7032321691513062, |
|
"kl": 0.23093389347195625, |
|
"learning_rate": 1.507684480352292e-07, |
|
"loss": 0.0092, |
|
"reward": 1.651041716337204, |
|
"reward_std": 0.36715345084667206, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.4375000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.39583333395421505, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 173.04166793823242, |
|
"epoch": 0.12150408135956109, |
|
"grad_norm": 0.777036726474762, |
|
"kl": 0.22725828364491463, |
|
"learning_rate": 1.3905907440629752e-07, |
|
"loss": 0.0091, |
|
"reward": 1.333333358168602, |
|
"reward_std": 0.3425312591716647, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.3750000037252903, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333358168602, |
|
"rewards/xmlcount_reward_func": 0.4791666716337204, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 114.66667079925537, |
|
"epoch": 0.12203934162986753, |
|
"grad_norm": 2.417041778564453, |
|
"kl": 0.15211265347898006, |
|
"learning_rate": 1.278099708887587e-07, |
|
"loss": 0.0061, |
|
"reward": 2.067708373069763, |
|
"reward_std": 0.8522266149520874, |
|
"rewards/correctness_reward_func": 0.6666666716337204, |
|
"rewards/int_reward_func": 0.4791666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 71.45833587646484, |
|
"epoch": 0.12257460190017395, |
|
"grad_norm": 1.4324641227722168, |
|
"kl": 0.2629435919225216, |
|
"learning_rate": 1.1702333051763271e-07, |
|
"loss": 0.0105, |
|
"reward": 1.4375000149011612, |
|
"reward_std": 0.4443886801600456, |
|
"rewards/correctness_reward_func": 0.25, |
|
"rewards/int_reward_func": 0.20833333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 265.95834159851074, |
|
"epoch": 0.1231098621704804, |
|
"grad_norm": 0.49865278601646423, |
|
"kl": 0.08768011070787907, |
|
"learning_rate": 1.067012561698319e-07, |
|
"loss": 0.0035, |
|
"reward": 0.8750000149011612, |
|
"reward_std": 0.42222120985388756, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.02083333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3333333432674408, |
|
"rewards/xmlcount_reward_func": 0.4375, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 365.291672706604, |
|
"epoch": 0.12364512244078683, |
|
"grad_norm": 0.5076226592063904, |
|
"kl": 0.13591468706727028, |
|
"learning_rate": 9.684576015420277e-08, |
|
"loss": 0.0054, |
|
"reward": 1.083333358168602, |
|
"reward_std": 0.49768130481243134, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.25, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.27083333395421505, |
|
"rewards/xmlcount_reward_func": 0.3958333358168602, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 95.41666793823242, |
|
"epoch": 0.12418038271109327, |
|
"grad_norm": 1.2022953033447266, |
|
"kl": 0.14907664991915226, |
|
"learning_rate": 8.745876381922147e-08, |
|
"loss": 0.006, |
|
"reward": 1.3697917461395264, |
|
"reward_std": 0.4918174706399441, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.31250000186264515, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.4739583358168602, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 113.79166793823242, |
|
"epoch": 0.1247156429813997, |
|
"grad_norm": 0.5673840641975403, |
|
"kl": 0.26192033290863037, |
|
"learning_rate": 7.854209717842231e-08, |
|
"loss": 0.0105, |
|
"reward": 1.3125000149011612, |
|
"reward_std": 0.15864631533622742, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.37500000558793545, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375000074505806, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 112.12500190734863, |
|
"epoch": 0.12525090325170615, |
|
"grad_norm": 1.3854151964187622, |
|
"kl": 0.17152241989970207, |
|
"learning_rate": 7.009749855363457e-08, |
|
"loss": 0.0069, |
|
"reward": 1.4375000596046448, |
|
"reward_std": 0.4543575756251812, |
|
"rewards/correctness_reward_func": 0.1666666716337204, |
|
"rewards/int_reward_func": 0.33333333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375000074505806, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 98.75000381469727, |
|
"epoch": 0.12578616352201258, |
|
"grad_norm": 0.790249764919281, |
|
"kl": 0.14792344719171524, |
|
"learning_rate": 6.212661423609184e-08, |
|
"loss": 0.0059, |
|
"reward": 1.7291666865348816, |
|
"reward_std": 0.5564306005835533, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.4791666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 144.8333396911621, |
|
"epoch": 0.126321423792319, |
|
"grad_norm": 0.2076501101255417, |
|
"kl": 0.18184524960815907, |
|
"learning_rate": 5.463099816548578e-08, |
|
"loss": 0.0073, |
|
"reward": 1.4479166865348816, |
|
"reward_std": 0.2685803771018982, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.4583333358168602, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 0.46875, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 142.12500190734863, |
|
"epoch": 0.12685668406262546, |
|
"grad_norm": 1.9539458751678467, |
|
"kl": 0.24447984993457794, |
|
"learning_rate": 4.761211162702117e-08, |
|
"loss": 0.0098, |
|
"reward": 1.2968750596046448, |
|
"reward_std": 0.29226116091012955, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.4791666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3333333469927311, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 72.37500381469727, |
|
"epoch": 0.12739194433293188, |
|
"grad_norm": 1.3912091255187988, |
|
"kl": 0.29350727051496506, |
|
"learning_rate": 4.1071322966535487e-08, |
|
"loss": 0.0117, |
|
"reward": 1.291666716337204, |
|
"reward_std": 0.2096773497760296, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.3125000074505806, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 82.41667175292969, |
|
"epoch": 0.12792720460323834, |
|
"grad_norm": 0.43811848759651184, |
|
"kl": 0.202481709420681, |
|
"learning_rate": 3.5009907323737826e-08, |
|
"loss": 0.0081, |
|
"reward": 1.2083333432674408, |
|
"reward_std": 0.06454972922801971, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.2291666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 85.45833587646484, |
|
"epoch": 0.12846246487354476, |
|
"grad_norm": 0.8119534850120544, |
|
"kl": 0.23333512246608734, |
|
"learning_rate": 2.9429046383618042e-08, |
|
"loss": 0.0093, |
|
"reward": 1.1666666865348816, |
|
"reward_std": 0.11949635669589043, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.1666666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 160.91666984558105, |
|
"epoch": 0.1289977251438512, |
|
"grad_norm": 1.1854231357574463, |
|
"kl": 0.15807193890213966, |
|
"learning_rate": 2.4329828146074096e-08, |
|
"loss": 0.0063, |
|
"reward": 0.8219999894499779, |
|
"reward_std": 0.6791375987231731, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.1875, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375000074505806, |
|
"rewards/xmlcount_reward_func": 0.19699998199939728, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 72.75000190734863, |
|
"epoch": 0.12953298541415764, |
|
"grad_norm": 0.45305031538009644, |
|
"kl": 0.20153097435832024, |
|
"learning_rate": 1.9713246713805588e-08, |
|
"loss": 0.0081, |
|
"reward": 1.3125, |
|
"reward_std": 0.22008520364761353, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.2291666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 152.1666717529297, |
|
"epoch": 0.13006824568446407, |
|
"grad_norm": 0.4157455861568451, |
|
"kl": 0.1709270216524601, |
|
"learning_rate": 1.5580202098509078e-08, |
|
"loss": 0.0068, |
|
"reward": 1.4427083432674408, |
|
"reward_std": 0.345734566450119, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.4583333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666679084301, |
|
"rewards/xmlcount_reward_func": 0.484375, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 72.79166889190674, |
|
"epoch": 0.1306035059547705, |
|
"grad_norm": 1.6172090768814087, |
|
"kl": 0.1881002075970173, |
|
"learning_rate": 1.193150004542204e-08, |
|
"loss": 0.0075, |
|
"reward": 1.4322916865348816, |
|
"reward_std": 0.26791293919086456, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.375, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.4947916716337204, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 63.541666984558105, |
|
"epoch": 0.13113876622507695, |
|
"grad_norm": 0.6279511451721191, |
|
"kl": 0.26533466950058937, |
|
"learning_rate": 8.767851876239075e-09, |
|
"loss": 0.0106, |
|
"reward": 1.2708333432674408, |
|
"reward_std": 0.05103103816509247, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.27083333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 188.20833587646484, |
|
"epoch": 0.13167402649538337, |
|
"grad_norm": 0.917432963848114, |
|
"kl": 0.16238786652684212, |
|
"learning_rate": 6.089874350439507e-09, |
|
"loss": 0.0065, |
|
"reward": 1.6145834028720856, |
|
"reward_std": 0.7706367075443268, |
|
"rewards/correctness_reward_func": 0.3333333432674408, |
|
"rewards/int_reward_func": 0.3958333432674408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4166666716337204, |
|
"rewards/xmlcount_reward_func": 0.46875, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 102.25000381469727, |
|
"epoch": 0.13220928676568983, |
|
"grad_norm": 0.6094866394996643, |
|
"kl": 0.1517469845712185, |
|
"learning_rate": 3.8980895450474455e-09, |
|
"loss": 0.0061, |
|
"reward": 1.2708333432674408, |
|
"reward_std": 0.27258947491645813, |
|
"rewards/correctness_reward_func": 0.0833333358168602, |
|
"rewards/int_reward_func": 0.2291666716337204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4583333358168602, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 265.0000114440918, |
|
"epoch": 0.13274454703599625, |
|
"grad_norm": 1.4663077592849731, |
|
"kl": 0.1445157825946808, |
|
"learning_rate": 2.192924752854042e-09, |
|
"loss": 0.0058, |
|
"reward": 0.9218750074505806, |
|
"reward_std": 0.21220333129167557, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.18750000186264515, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3125, |
|
"rewards/xmlcount_reward_func": 0.421875, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 78.83333587646484, |
|
"epoch": 0.13327980730630268, |
|
"grad_norm": 1.0312261581420898, |
|
"kl": 0.16930826753377914, |
|
"learning_rate": 9.747123991141193e-10, |
|
"loss": 0.0068, |
|
"reward": 1.229166716337204, |
|
"reward_std": 0.1530931033194065, |
|
"rewards/correctness_reward_func": 0.0, |
|
"rewards/int_reward_func": 0.25000000558793545, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4791666716337204, |
|
"rewards/xmlcount_reward_func": 0.5, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 271.9166793823242, |
|
"epoch": 0.13381506757660913, |
|
"grad_norm": 0.3518020808696747, |
|
"kl": 0.12262176536023617, |
|
"learning_rate": 2.43689976739403e-10, |
|
"loss": 0.0049, |
|
"reward": 1.5572916716337204, |
|
"reward_std": 0.25069504231214523, |
|
"rewards/correctness_reward_func": 0.5, |
|
"rewards/int_reward_func": 0.20833333395421505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3958333395421505, |
|
"rewards/xmlcount_reward_func": 0.453125, |
|
"step": 250 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 250, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|