BullUp's picture
End of training
fe9fc68 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 16.666666666666668,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 405.25,
"epoch": 0.16666666666666666,
"grad_norm": 0.4627874195575714,
"kl": 0.005518825375474989,
"learning_rate": 8.333333333333333e-08,
"loss": 0.0,
"reward": 0.9609934687614441,
"reward_std": 0.9843197166919708,
"rewards/concensus_correctness_reward_func": 0.0403750017285347,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.48086845502257347,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.17412499990314245,
"step": 2
},
{
"completion_length": 456.9375,
"epoch": 0.3333333333333333,
"grad_norm": 0.34028512239456177,
"kl": 0.0030479243432637304,
"learning_rate": 2.5e-07,
"loss": 0.0,
"reward": 0.9757307097315788,
"reward_std": 1.0358425956219435,
"rewards/concensus_correctness_reward_func": 0.013249999843537807,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.42269944585859776,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.21165626123547554,
"step": 4
},
{
"completion_length": 541.40625,
"epoch": 0.5,
"grad_norm": 0.6734604835510254,
"kl": 0.0027404608263168484,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.0,
"reward": 0.9920943900942802,
"reward_std": 0.8858779668807983,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5916256122291088,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.24421875784173608,
"step": 6
},
{
"completion_length": 510.90625,
"epoch": 0.6666666666666666,
"grad_norm": 0.38359829783439636,
"kl": 0.004760640629683621,
"learning_rate": 4.99967220916408e-07,
"loss": 0.0,
"reward": 1.3815101590007544,
"reward_std": 0.6916671060025692,
"rewards/concensus_correctness_reward_func": 0.06412499956786633,
"rewards/consensus_reward_func": 0.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.6536664366722107,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.3668437581509352,
"step": 8
},
{
"completion_length": 425.09375,
"epoch": 0.8333333333333334,
"grad_norm": 0.5544623732566833,
"kl": 0.0043694232881534845,
"learning_rate": 4.997050398198976e-07,
"loss": 0.0,
"reward": 1.6439796090126038,
"reward_std": 0.8410376366227865,
"rewards/concensus_correctness_reward_func": 0.0768750011920929,
"rewards/consensus_reward_func": 0.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5118233747780323,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.10215624794363976,
"step": 10
},
{
"completion_length": 424.5,
"epoch": 1.0,
"grad_norm": 0.3566124141216278,
"kl": 0.0029268394282553345,
"learning_rate": 4.991809526186423e-07,
"loss": 0.0,
"reward": 0.7868102770298719,
"reward_std": 0.9274830296635628,
"rewards/concensus_correctness_reward_func": 0.04543749988079071,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.5272165313363075,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.03584375884383917,
"step": 12
},
{
"completion_length": 455.28125,
"epoch": 1.1666666666666667,
"grad_norm": 0.4540335536003113,
"kl": 0.0037232608010526747,
"learning_rate": 4.983955090077444e-07,
"loss": 0.0,
"reward": 0.9765743091702461,
"reward_std": 1.24457941763103,
"rewards/concensus_correctness_reward_func": 0.014875000342726707,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.4753867909312248,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.09568749740719795,
"step": 14
},
{
"completion_length": 377.21875,
"epoch": 1.3333333333333333,
"grad_norm": 1.5153367519378662,
"kl": 0.0050389311800245196,
"learning_rate": 4.973495328090889e-07,
"loss": 0.0,
"reward": 1.0638171210885048,
"reward_std": 1.0938183590769768,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.5703795924782753,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.1965624913573265,
"step": 16
},
{
"completion_length": 472.5,
"epoch": 1.5,
"grad_norm": 0.3531189560890198,
"kl": 0.0040535782754886895,
"learning_rate": 4.960441211072685e-07,
"loss": 0.0,
"reward": 1.1587776821106672,
"reward_std": 1.1633746810257435,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.5646526888012886,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03162500262260437,
"step": 18
},
{
"completion_length": 432.9375,
"epoch": 1.6666666666666665,
"grad_norm": 0.42279884219169617,
"kl": 0.0036564485053531826,
"learning_rate": 4.944806430988927e-07,
"loss": 0.0,
"reward": 0.8500742139294744,
"reward_std": 1.044841218739748,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.4372929581440985,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": -0.040343739092350006,
"step": 20
},
{
"completion_length": 503.65625,
"epoch": 1.8333333333333335,
"grad_norm": 0.35253623127937317,
"kl": 0.002885653229895979,
"learning_rate": 4.926607386564898e-07,
"loss": 0.0,
"reward": 0.9196254201233387,
"reward_std": 1.1270109415054321,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6083129234611988,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.061312501318752766,
"step": 22
},
{
"completion_length": 390.3125,
"epoch": 2.0,
"grad_norm": 0.7617405652999878,
"kl": 0.0034773742081597447,
"learning_rate": 4.905863166085075e-07,
"loss": 0.0,
"reward": 0.9421910382807255,
"reward_std": 1.0266394726932049,
"rewards/concensus_correctness_reward_func": 0.0403750017285347,
"rewards/consensus_reward_func": 0.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5005660317838192,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.21375000849366188,
"step": 24
},
{
"completion_length": 446.375,
"epoch": 2.1666666666666665,
"grad_norm": 0.539344310760498,
"kl": 0.0038461567601189017,
"learning_rate": 4.882595527372152e-07,
"loss": 0.0,
"reward": 0.990186957642436,
"reward_std": 1.246212176978588,
"rewards/concensus_correctness_reward_func": 0.025624999776482582,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.39828070998191833,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.12878123950213194,
"step": 26
},
{
"completion_length": 484.21875,
"epoch": 2.3333333333333335,
"grad_norm": 0.573151171207428,
"kl": 0.003478569706203416,
"learning_rate": 4.856828874966086e-07,
"loss": 0.0,
"reward": 1.1689446438103914,
"reward_std": 1.0504138097167015,
"rewards/concensus_correctness_reward_func": 0.02199999988079071,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.656163401901722,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.0376562625169754,
"step": 28
},
{
"completion_length": 420.25,
"epoch": 2.5,
"grad_norm": 0.5913636088371277,
"kl": 0.004606734742992558,
"learning_rate": 4.828590234527106e-07,
"loss": 0.0,
"reward": 1.556147813796997,
"reward_std": 2.1363642998039722,
"rewards/concensus_correctness_reward_func": 0.625,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.3810540623962879,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.221968749538064,
"step": 30
},
{
"completion_length": 428.28125,
"epoch": 2.6666666666666665,
"grad_norm": 0.4051259756088257,
"kl": 0.003205618428182788,
"learning_rate": 4.79790922448953e-07,
"loss": 0.0,
"reward": 1.021159190684557,
"reward_std": 0.932405311614275,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5468154139816761,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.20871875435113907,
"step": 32
},
{
"completion_length": 477.0,
"epoch": 2.8333333333333335,
"grad_norm": 1.2722704410552979,
"kl": 0.003231980503187515,
"learning_rate": 4.7648180249961165e-07,
"loss": 0.0,
"reward": 1.096991840749979,
"reward_std": 1.2813670113682747,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5021168403327465,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.18862500227987766,
"step": 34
},
{
"completion_length": 517.65625,
"epoch": 3.0,
"grad_norm": 0.5277615785598755,
"kl": 0.003874574991641566,
"learning_rate": 4.7293513441455357e-07,
"loss": 0.0,
"reward": 0.7769100246950984,
"reward_std": 1.269415270537138,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5426287725567818,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.031343746930360794,
"step": 36
},
{
"completion_length": 507.5,
"epoch": 3.1666666666666665,
"grad_norm": 0.4287290871143341,
"kl": 0.0033028597244992852,
"learning_rate": 4.691546381588369e-07,
"loss": 0.0,
"reward": 0.550812273286283,
"reward_std": 1.1209257319569588,
"rewards/concensus_correctness_reward_func": 0.02306249924004078,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.3885622890666127,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11081249732524157,
"step": 38
},
{
"completion_length": 508.34375,
"epoch": 3.3333333333333335,
"grad_norm": 0.40618908405303955,
"kl": 0.0037306795711629093,
"learning_rate": 4.651442789509813e-07,
"loss": 0.0,
"reward": 0.7808872517198324,
"reward_std": 0.8644677195698023,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6908872574567795,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.034999994561076164,
"step": 40
},
{
"completion_length": 409.46875,
"epoch": 3.5,
"grad_norm": 0.39888831973075867,
"kl": 0.004624970955774188,
"learning_rate": 4.609082631040011e-07,
"loss": 0.0,
"reward": 1.320845801383257,
"reward_std": 1.0828375816345215,
"rewards/concensus_correctness_reward_func": 0.07012500241398811,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5185021087527275,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.3259687628597021,
"step": 42
},
{
"completion_length": 454.125,
"epoch": 3.6666666666666665,
"grad_norm": 5.556149959564209,
"kl": 0.003802574341534637,
"learning_rate": 4.5645103361356407e-07,
"loss": 0.0,
"reward": 0.8918353654444218,
"reward_std": 1.0566863603889942,
"rewards/concensus_correctness_reward_func": 0.025624999776482582,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5350853689014912,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.08112499676644802,
"step": 44
},
{
"completion_length": 457.34375,
"epoch": 3.8333333333333335,
"grad_norm": 0.35926172137260437,
"kl": 0.003958026180043817,
"learning_rate": 4.517772654979023e-07,
"loss": 0.0,
"reward": 0.9805956035852432,
"reward_std": 0.7859199158847332,
"rewards/concensus_correctness_reward_func": 0.0260624997317791,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5270956140011549,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.30243751779198647,
"step": 46
},
{
"completion_length": 429.90625,
"epoch": 4.0,
"grad_norm": 0.5926030874252319,
"kl": 0.005827751272590831,
"learning_rate": 4.468918608943636e-07,
"loss": 0.0,
"reward": 1.7501742951571941,
"reward_std": 2.434488591738045,
"rewards/concensus_correctness_reward_func": 0.625,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.38520555198192596,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.25559374690055847,
"step": 48
},
{
"completion_length": 463.65625,
"epoch": 4.166666666666667,
"grad_norm": 0.7409794330596924,
"kl": 0.00406863066018559,
"learning_rate": 4.417999439177465e-07,
"loss": 0.0,
"reward": 1.2483692020177841,
"reward_std": 0.8449539989233017,
"rewards/concensus_correctness_reward_func": 0.0403750017285347,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.359212932176888,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.4269062578678131,
"step": 50
},
{
"completion_length": 437.71875,
"epoch": 4.333333333333333,
"grad_norm": 0.39966848492622375,
"kl": 0.0033464983571320772,
"learning_rate": 4.365068552858115e-07,
"loss": 0.0,
"reward": 1.5179361971095204,
"reward_std": 1.1835376657545567,
"rewards/concensus_correctness_reward_func": 0.02199999988079071,
"rewards/consensus_reward_func": 0.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5100300014019012,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.3452812507748604,
"step": 52
},
{
"completion_length": 402.59375,
"epoch": 4.5,
"grad_norm": 0.5886970162391663,
"kl": 0.004897425736999139,
"learning_rate": 4.310181467176054e-07,
"loss": 0.0,
"reward": 1.4094564961269498,
"reward_std": 0.9502353556454182,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5278940042480826,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.178437490016222,
"step": 54
},
{
"completion_length": 432.09375,
"epoch": 4.666666666666667,
"grad_norm": 0.41223806142807007,
"kl": 0.004628591472283006,
"learning_rate": 4.253395751104748e-07,
"loss": 0.0,
"reward": 2.7664652466773987,
"reward_std": 2.371488120406866,
"rewards/concensus_correctness_reward_func": 1.2650624997913837,
"rewards/consensus_reward_func": 0.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.6736840140074492,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.20271877851337194,
"step": 56
},
{
"completion_length": 458.21875,
"epoch": 4.833333333333333,
"grad_norm": 1.161920428276062,
"kl": 0.004868229589192197,
"learning_rate": 4.194770965018758e-07,
"loss": 0.0,
"reward": 0.8611085470765829,
"reward_std": 0.9239636212587357,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.4858272895216942,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.0940312510356307,
"step": 58
},
{
"completion_length": 575.1875,
"epoch": 5.0,
"grad_norm": 0.4627440869808197,
"kl": 0.0028841852181358263,
"learning_rate": 4.1343685982231315e-07,
"loss": 0.0,
"reward": 0.7771566994488239,
"reward_std": 0.9787662029266357,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6247504502534866,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": -0.0038437489420175552,
"step": 60
},
{
"completion_length": 452.625,
"epoch": 5.166666666666667,
"grad_norm": 0.372567355632782,
"kl": 0.0034624057734617963,
"learning_rate": 4.072252004459611e-07,
"loss": 0.0,
"reward": 0.7446466162800789,
"reward_std": 0.7917606569826603,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5209591202437878,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.14556249883025885,
"step": 62
},
{
"completion_length": 462.46875,
"epoch": 5.333333333333333,
"grad_norm": 0.855252742767334,
"kl": 0.005477891769260168,
"learning_rate": 4.0084863354573116e-07,
"loss": 0.0,
"reward": 1.2447932958602905,
"reward_std": 1.16534423828125,
"rewards/concensus_correctness_reward_func": 0.04312499985098839,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6511683035641909,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0973750026896596,
"step": 64
},
{
"completion_length": 552.125,
"epoch": 5.5,
"grad_norm": 0.7227801084518433,
"kl": 0.0039346517296507955,
"learning_rate": 3.9431384725975485e-07,
"loss": 0.0,
"reward": 1.028742030262947,
"reward_std": 1.049393068999052,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6304295305162668,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.11706250137649477,
"step": 66
},
{
"completion_length": 403.0,
"epoch": 5.666666666666667,
"grad_norm": 0.4475947320461273,
"kl": 0.003715590573847294,
"learning_rate": 3.876276956764509e-07,
"loss": 0.0,
"reward": 0.9110647588968277,
"reward_std": 0.9707382656633854,
"rewards/concensus_correctness_reward_func": 0.0403750017285347,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5140022300183773,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.07543750200420618,
"step": 68
},
{
"completion_length": 445.28125,
"epoch": 5.833333333333333,
"grad_norm": 0.48278599977493286,
"kl": 0.003939619520679116,
"learning_rate": 3.807971916455325e-07,
"loss": 0.0,
"reward": 1.055080994963646,
"reward_std": 1.2731535732746124,
"rewards/concensus_correctness_reward_func": 0.04543749988079071,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.5798934735357761,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.007750006392598152,
"step": 70
},
{
"completion_length": 514.09375,
"epoch": 6.0,
"grad_norm": 0.5504283308982849,
"kl": 0.0036322681698948145,
"learning_rate": 3.738294994224969e-07,
"loss": 0.0,
"reward": 0.7345115430653095,
"reward_std": 0.9186943359673023,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.35060528852045536,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.11828124895691872,
"step": 72
},
{
"completion_length": 381.0625,
"epoch": 6.166666666666667,
"grad_norm": 0.5463396310806274,
"kl": 0.004689242952736095,
"learning_rate": 3.6673192715431014e-07,
"loss": 0.0,
"reward": 1.040903776884079,
"reward_std": 0.8909546323120594,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.3645600201562047,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.2857187483459711,
"step": 74
},
{
"completion_length": 461.90625,
"epoch": 6.333333333333333,
"grad_norm": 0.5456697344779968,
"kl": 0.003803928440902382,
"learning_rate": 3.595119192141706e-07,
"loss": 0.0,
"reward": 1.3395836614072323,
"reward_std": 1.3747756779193878,
"rewards/concensus_correctness_reward_func": 0.07437499985098839,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5419899076223373,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.3169687418267131,
"step": 76
},
{
"completion_length": 491.09375,
"epoch": 6.5,
"grad_norm": 0.4154431223869324,
"kl": 0.004837977059651166,
"learning_rate": 3.5217704839338905e-07,
"loss": 0.0,
"reward": 0.9017609618604183,
"reward_std": 1.2408568933606148,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.519823431968689,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": -0.008687485009431839,
"step": 78
},
{
"completion_length": 459.5625,
"epoch": 6.666666666666667,
"grad_norm": 0.4849317669868469,
"kl": 0.003633130807429552,
"learning_rate": 3.447350079585767e-07,
"loss": 0.0,
"reward": 1.0950381644070148,
"reward_std": 0.92024633474648,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.4737569224089384,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.003718756139278412,
"step": 80
},
{
"completion_length": 500.5625,
"epoch": 6.833333333333333,
"grad_norm": 0.5481275916099548,
"kl": 0.00398016459075734,
"learning_rate": 3.3719360358247053e-07,
"loss": 0.0,
"reward": 0.981081947684288,
"reward_std": 1.2141644582152367,
"rewards/concensus_correctness_reward_func": 0.055250002071261406,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.49655068665742874,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.054281254298985004,
"step": 82
},
{
"completion_length": 395.84375,
"epoch": 7.0,
"grad_norm": 0.4870210886001587,
"kl": 0.005125438881805167,
"learning_rate": 3.29560745156861e-07,
"loss": 0.0,
"reward": 2.616707056760788,
"reward_std": 2.2652119155973196,
"rewards/concensus_correctness_reward_func": 1.2730624992400408,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6221445724368095,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.3777499981224537,
"step": 84
},
{
"completion_length": 397.6875,
"epoch": 7.166666666666667,
"grad_norm": 1.8424495458602905,
"kl": 0.005850363930221647,
"learning_rate": 3.218444384962071e-07,
"loss": 0.0,
"reward": 0.7416995037347078,
"reward_std": 0.7169875521212816,
"rewards/concensus_correctness_reward_func": 0.030124999582767487,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.42598075792193413,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.019968749955296516,
"step": 86
},
{
"completion_length": 451.84375,
"epoch": 7.333333333333333,
"grad_norm": 1.3289145231246948,
"kl": 0.005223560554441065,
"learning_rate": 3.1405277694064305e-07,
"loss": 0.0,
"reward": 1.037564679980278,
"reward_std": 0.8682594746351242,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.4709709379822016,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.050968751311302185,
"step": 88
},
{
"completion_length": 507.65625,
"epoch": 7.5,
"grad_norm": 0.5467274785041809,
"kl": 0.00422226672526449,
"learning_rate": 3.0619393286718237e-07,
"loss": 0.0,
"reward": 2.325454168021679,
"reward_std": 2.455311745405197,
"rewards/concensus_correctness_reward_func": 1.270999999716878,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.6971416063606739,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": -0.09581249579787254,
"step": 90
},
{
"completion_length": 431.34375,
"epoch": 7.666666666666667,
"grad_norm": 0.4258057773113251,
"kl": 0.005024322046665475,
"learning_rate": 2.98276149118022e-07,
"loss": 0.0,
"reward": 1.168196927756071,
"reward_std": 1.1126753464341164,
"rewards/concensus_correctness_reward_func": 0.02199999988079071,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.5554468911141157,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.18450001627206802,
"step": 92
},
{
"completion_length": 420.5,
"epoch": 7.833333333333333,
"grad_norm": 3.349438428878784,
"kl": 0.008144651423208416,
"learning_rate": 2.903077303549399e-07,
"loss": 0.0,
"reward": 1.0509218061342835,
"reward_std": 1.0600119307637215,
"rewards/concensus_correctness_reward_func": 0.04312499985098839,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.39529681019484997,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2374999923631549,
"step": 94
},
{
"completion_length": 434.96875,
"epoch": 8.0,
"grad_norm": 0.49676457047462463,
"kl": 0.00413627817761153,
"learning_rate": 2.822970343488516e-07,
"loss": 0.0,
"reward": 0.9490811452269554,
"reward_std": 0.8340496830642223,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5972061259672046,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.16437500715255737,
"step": 96
},
{
"completion_length": 403.90625,
"epoch": 8.166666666666666,
"grad_norm": 0.5876235961914062,
"kl": 0.007058300951030105,
"learning_rate": 2.7425246321366205e-07,
"loss": 0.0,
"reward": 1.397300623357296,
"reward_std": 0.7023336328566074,
"rewards/concensus_correctness_reward_func": 0.020999999716877937,
"rewards/consensus_reward_func": 0.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.583988162688911,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.15168750286102295,
"step": 98
},
{
"completion_length": 533.15625,
"epoch": 8.333333333333334,
"grad_norm": 0.429283082485199,
"kl": 0.004187059836112894,
"learning_rate": 2.661824545936089e-07,
"loss": 0.0,
"reward": 1.2581686060875654,
"reward_std": 1.170703399926424,
"rewards/concensus_correctness_reward_func": 0.02306249924004078,
"rewards/consensus_reward_func": 0.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.697043601423502,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": -0.008812484331429005,
"step": 100
},
{
"completion_length": 445.3125,
"epoch": 8.5,
"grad_norm": 0.5875679850578308,
"kl": 0.0034884795895777643,
"learning_rate": 2.58095472813339e-07,
"loss": 0.0,
"reward": 0.9280952122062445,
"reward_std": 0.9815445207059383,
"rewards/concensus_correctness_reward_func": 0.0403750017285347,
"rewards/consensus_reward_func": 0.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5310951881110668,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.16912501025944948,
"step": 102
},
{
"completion_length": 383.84375,
"epoch": 8.666666666666666,
"grad_norm": 0.5602811574935913,
"kl": 0.00470353034324944,
"learning_rate": 2.5e-07,
"loss": 0.0,
"reward": 1.1036290470510721,
"reward_std": 1.118781317025423,
"rewards/concensus_correctness_reward_func": 0.015062499791383743,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.4021915583871305,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.10825000796467066,
"step": 104
},
{
"completion_length": 494.15625,
"epoch": 8.833333333333334,
"grad_norm": 0.45191463828086853,
"kl": 0.005157338426215574,
"learning_rate": 2.4190452718666105e-07,
"loss": 0.0,
"reward": 0.9766125809401274,
"reward_std": 0.8945337496697903,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.5054875910282135,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.028874988667666912,
"step": 106
},
{
"completion_length": 495.875,
"epoch": 9.0,
"grad_norm": 0.5252740383148193,
"kl": 0.003989085234934464,
"learning_rate": 2.3381754540639106e-07,
"loss": 0.0,
"reward": 1.4440684113651514,
"reward_std": 2.4300696589052677,
"rewards/concensus_correctness_reward_func": 0.6704375147819519,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.4195683840662241,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08343750424683094,
"step": 108
},
{
"completion_length": 548.71875,
"epoch": 9.166666666666666,
"grad_norm": 0.3460928201675415,
"kl": 0.004296147613786161,
"learning_rate": 2.2574753678633798e-07,
"loss": 0.0,
"reward": 1.181284163147211,
"reward_std": 0.9245094396173954,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.5359716564416885,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.2546875076368451,
"step": 110
},
{
"completion_length": 407.4375,
"epoch": 9.333333333333334,
"grad_norm": 0.49219369888305664,
"kl": 0.004819943045731634,
"learning_rate": 2.1770296565114846e-07,
"loss": 0.0,
"reward": 0.96442811191082,
"reward_std": 1.1890033707022667,
"rewards/concensus_correctness_reward_func": 0.051249999552965164,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.42905314173549414,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.12475000228732824,
"step": 112
},
{
"completion_length": 462.34375,
"epoch": 9.5,
"grad_norm": 0.4612361788749695,
"kl": 0.004889295043540187,
"learning_rate": 2.0969226964506005e-07,
"loss": 0.0,
"reward": 0.9236417841166258,
"reward_std": 0.9137359261512756,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5326105281710625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.00040626712143421173,
"step": 114
},
{
"completion_length": 436.25,
"epoch": 9.666666666666666,
"grad_norm": 0.42146608233451843,
"kl": 0.004314293968491256,
"learning_rate": 2.0172385088197803e-07,
"loss": 0.0,
"reward": 0.8642384810373187,
"reward_std": 0.9441619887948036,
"rewards/concensus_correctness_reward_func": 0.010999999940395355,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5045822560787201,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.27053125388920307,
"step": 116
},
{
"completion_length": 414.46875,
"epoch": 9.833333333333334,
"grad_norm": 1.0195592641830444,
"kl": 0.007958032132592052,
"learning_rate": 1.9380606713281772e-07,
"loss": 0.0,
"reward": 2.222010016441345,
"reward_std": 2.534605525434017,
"rewards/concensus_correctness_reward_func": 0.625,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.5085724969394505,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.4478124938905239,
"step": 118
},
{
"completion_length": 473.09375,
"epoch": 10.0,
"grad_norm": 0.5131120681762695,
"kl": 0.0043090580438729376,
"learning_rate": 1.859472230593569e-07,
"loss": 0.0,
"reward": 0.8962987270206213,
"reward_std": 1.2116570584475994,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6050799824297428,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.02559376321732998,
"step": 120
},
{
"completion_length": 493.90625,
"epoch": 10.166666666666666,
"grad_norm": 0.563905656337738,
"kl": 0.0038846915995236486,
"learning_rate": 1.7815556150379296e-07,
"loss": 0.0,
"reward": 1.1194737702608109,
"reward_std": 1.1054411605000496,
"rewards/concensus_correctness_reward_func": 0.010999999940395355,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5926925111562014,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.07828125357627869,
"step": 122
},
{
"completion_length": 433.03125,
"epoch": 10.333333333333334,
"grad_norm": 0.8397959470748901,
"kl": 0.0044599128887057304,
"learning_rate": 1.704392548431391e-07,
"loss": 0.0,
"reward": 1.2151572033762932,
"reward_std": 1.2529273331165314,
"rewards/concensus_correctness_reward_func": 0.0403750017285347,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6728134155273438,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1894687432795763,
"step": 124
},
{
"completion_length": 568.875,
"epoch": 10.5,
"grad_norm": 0.41829314827919006,
"kl": 0.004545757925370708,
"learning_rate": 1.6280639641752942e-07,
"loss": 0.0,
"reward": 1.2793281599879265,
"reward_std": 1.0452781841158867,
"rewards/concensus_correctness_reward_func": 0.014875000342726707,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.42804690077900887,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.46140626445412636,
"step": 126
},
{
"completion_length": 476.28125,
"epoch": 10.666666666666666,
"grad_norm": 0.4802101254463196,
"kl": 0.004307686380343512,
"learning_rate": 1.552649920414233e-07,
"loss": 0.0,
"reward": 0.6176988333463669,
"reward_std": 0.7555880509316921,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.33288631308823824,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.14418751001358032,
"step": 128
},
{
"completion_length": 398.25,
"epoch": 10.833333333333334,
"grad_norm": 0.7297332882881165,
"kl": 0.005738754698541015,
"learning_rate": 1.47822951606611e-07,
"loss": 0.0,
"reward": 1.0000969618558884,
"reward_std": 0.8809923827648163,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.5464094616472721,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.1255625020712614,
"step": 130
},
{
"completion_length": 441.96875,
"epoch": 11.0,
"grad_norm": 0.4999024271965027,
"kl": 0.004563337104627863,
"learning_rate": 1.4048808078582942e-07,
"loss": 0.0,
"reward": 1.2324171178042889,
"reward_std": 1.470774844288826,
"rewards/concensus_correctness_reward_func": 0.025624999776482582,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.6081358455121517,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.0830312529578805,
"step": 132
},
{
"completion_length": 496.75,
"epoch": 11.166666666666666,
"grad_norm": 0.6821574568748474,
"kl": 0.00595690673799254,
"learning_rate": 1.3326807284568984e-07,
"loss": 0.0,
"reward": 1.0568560734391212,
"reward_std": 0.9344909712672234,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6236373111605644,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.16759375110268593,
"step": 134
},
{
"completion_length": 458.25,
"epoch": 11.333333333333334,
"grad_norm": 0.37327247858047485,
"kl": 0.004472251705010422,
"learning_rate": 1.261705005775032e-07,
"loss": 0.0,
"reward": 0.8789136372506618,
"reward_std": 1.044730570167303,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.4194761496037245,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.05318749602884054,
"step": 136
},
{
"completion_length": 449.96875,
"epoch": 11.5,
"grad_norm": 0.6042804718017578,
"kl": 0.004963608211255632,
"learning_rate": 1.1920280835446748e-07,
"loss": 0.0,
"reward": 0.7906294241547585,
"reward_std": 0.8892590645700693,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.4917856939136982,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.04884374141693115,
"step": 138
},
{
"completion_length": 428.78125,
"epoch": 11.666666666666666,
"grad_norm": 0.43670710921287537,
"kl": 0.004157427014433779,
"learning_rate": 1.123723043235491e-07,
"loss": 0.0,
"reward": 0.9911715611815453,
"reward_std": 1.0859125535935163,
"rewards/concensus_correctness_reward_func": 0.051249999552965164,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6306090354919434,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0656874980777502,
"step": 140
},
{
"completion_length": 501.96875,
"epoch": 11.833333333333334,
"grad_norm": 0.36982962489128113,
"kl": 0.004740105330711231,
"learning_rate": 1.056861527402452e-07,
"loss": 0.0,
"reward": 1.4135117419064045,
"reward_std": 2.245922774076462,
"rewards/concensus_correctness_reward_func": 0.625,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.47179301269352436,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.051093748304992914,
"step": 142
},
{
"completion_length": 451.75,
"epoch": 12.0,
"grad_norm": 0.3974417448043823,
"kl": 0.0040429688087897375,
"learning_rate": 9.915136645426883e-08,
"loss": 0.0,
"reward": 0.7763103228062391,
"reward_std": 1.1694690249860287,
"rewards/concensus_correctness_reward_func": 0.05543750151991844,
"rewards/consensus_reward_func": 0.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.44059158489108086,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.03221874684095383,
"step": 144
},
{
"completion_length": 451.84375,
"epoch": 12.166666666666666,
"grad_norm": 0.4643774628639221,
"kl": 0.005262433871394023,
"learning_rate": 9.277479955403886e-08,
"loss": 0.0,
"reward": 0.8141104970127344,
"reward_std": 1.1598312184214592,
"rewards/concensus_correctness_reward_func": 0.06631249934434891,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.3407667353749275,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03203125298023224,
"step": 146
},
{
"completion_length": 474.53125,
"epoch": 12.333333333333334,
"grad_norm": 0.373272567987442,
"kl": 0.004522563278442249,
"learning_rate": 8.656314017768693e-08,
"loss": 0.0,
"reward": 1.0826607719063759,
"reward_std": 0.8991422578692436,
"rewards/concensus_correctness_reward_func": 0.010999999940395355,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6882857866585255,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.11775000300258398,
"step": 148
},
{
"completion_length": 520.5,
"epoch": 12.5,
"grad_norm": 0.5331616997718811,
"kl": 0.004154504800681025,
"learning_rate": 8.052290349812419e-08,
"loss": 0.0,
"reward": 0.7753689587116241,
"reward_std": 0.855584591627121,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.4509002063423395,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.011968761682510376,
"step": 150
},
{
"completion_length": 427.5,
"epoch": 12.666666666666666,
"grad_norm": 0.4988049566745758,
"kl": 0.00503876109723933,
"learning_rate": 7.46604248895252e-08,
"loss": 0.0,
"reward": 2.609547968953848,
"reward_std": 2.4432314597070217,
"rewards/concensus_correctness_reward_func": 1.25,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.6731417663395405,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.17078125616535544,
"step": 152
},
{
"completion_length": 407.5,
"epoch": 12.833333333333334,
"grad_norm": 0.4662899971008301,
"kl": 0.004989988505258225,
"learning_rate": 6.898185328239467e-08,
"loss": 0.0,
"reward": 1.0029053278267384,
"reward_std": 0.9076509363949299,
"rewards/concensus_correctness_reward_func": 0.0403750017285347,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.46990528982132673,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.08637500088661909,
"step": 154
},
{
"completion_length": 611.75,
"epoch": 13.0,
"grad_norm": 0.40727847814559937,
"kl": 0.0040196322806878015,
"learning_rate": 6.349314471418849e-08,
"loss": 0.0,
"reward": 0.8748770579695702,
"reward_std": 1.1846881732344627,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5004395483992994,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.09318749979138374,
"step": 156
},
{
"completion_length": 435.15625,
"epoch": 13.166666666666666,
"grad_norm": 0.778713583946228,
"kl": 0.005932875850703567,
"learning_rate": 5.8200056082253453e-08,
"loss": 0.0,
"reward": 1.1377633288502693,
"reward_std": 1.0723667368292809,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.518325824290514,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.2131875054910779,
"step": 158
},
{
"completion_length": 405.28125,
"epoch": 13.333333333333334,
"grad_norm": 0.6267311573028564,
"kl": 0.00516361856716685,
"learning_rate": 5.310813910563644e-08,
"loss": 0.0,
"reward": 1.4793311096727848,
"reward_std": 1.0320269502699375,
"rewards/concensus_correctness_reward_func": 0.04312499985098839,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.5567373372614384,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.4107187641784549,
"step": 160
},
{
"completion_length": 417.375,
"epoch": 13.5,
"grad_norm": 0.7477406859397888,
"kl": 0.007013184251263738,
"learning_rate": 4.8222734502097655e-08,
"loss": 0.0,
"reward": 1.0498220138251781,
"reward_std": 1.0254092812538147,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.3807595409452915,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.20031250617466867,
"step": 162
},
{
"completion_length": 524.9375,
"epoch": 13.666666666666666,
"grad_norm": 0.3479692041873932,
"kl": 0.003795241893385537,
"learning_rate": 4.35489663864359e-08,
"loss": 0.0,
"reward": 0.8717399332672358,
"reward_std": 0.986246095970273,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5171773973852396,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": -0.09856249298900366,
"step": 164
},
{
"completion_length": 417.5,
"epoch": 13.833333333333334,
"grad_norm": 0.4246520698070526,
"kl": 0.00443370349239558,
"learning_rate": 3.90917368959989e-08,
"loss": 0.0,
"reward": 2.2922814451158047,
"reward_std": 2.770563669502735,
"rewards/concensus_correctness_reward_func": 1.2991249989718199,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.5883127357810736,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": -0.048281239345669746,
"step": 166
},
{
"completion_length": 454.21875,
"epoch": 14.0,
"grad_norm": 0.6315698027610779,
"kl": 0.006786162994103506,
"learning_rate": 3.485572104901868e-08,
"loss": 0.0,
"reward": 0.9570262767374516,
"reward_std": 0.9035042636096478,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5563387721776962,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.30693749710917473,
"step": 168
},
{
"completion_length": 510.4375,
"epoch": 14.166666666666666,
"grad_norm": 0.34782856702804565,
"kl": 0.0037368796474765986,
"learning_rate": 3.08453618411631e-08,
"loss": 0.0,
"reward": 0.8143082866445184,
"reward_std": 1.0274437740445137,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.44208951387554407,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.10659373924136162,
"step": 170
},
{
"completion_length": 510.75,
"epoch": 14.333333333333334,
"grad_norm": 0.34581589698791504,
"kl": 0.0037871022359468043,
"learning_rate": 2.7064865585446433e-08,
"loss": 0.0,
"reward": 0.8895941041409969,
"reward_std": 1.2179592177271843,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5555003546178341,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.005968746729195118,
"step": 172
},
{
"completion_length": 425.09375,
"epoch": 14.5,
"grad_norm": 1.2589985132217407,
"kl": 0.00627510278718546,
"learning_rate": 2.3518197500388276e-08,
"loss": 0.0,
"reward": 1.2998529449105263,
"reward_std": 0.9905166923999786,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5617592297494411,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.3787187491543591,
"step": 174
},
{
"completion_length": 414.71875,
"epoch": 14.666666666666666,
"grad_norm": 0.5357417464256287,
"kl": 0.005580049444688484,
"learning_rate": 2.0209077551046976e-08,
"loss": 0.0,
"reward": 1.0539772361516953,
"reward_std": 1.2460423409938812,
"rewards/concensus_correctness_reward_func": 0.02199999988079071,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.43419601768255234,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.20715624745935202,
"step": 176
},
{
"completion_length": 429.375,
"epoch": 14.833333333333334,
"grad_norm": 1.0408557653427124,
"kl": 0.005757150123827159,
"learning_rate": 1.7140976547289438e-08,
"loss": 0.0,
"reward": 0.9175567002967,
"reward_std": 0.979901023209095,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.4002129649743438,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.12671875953674316,
"step": 178
},
{
"completion_length": 420.3125,
"epoch": 15.0,
"grad_norm": 7.589697360992432,
"kl": 0.007644714612979442,
"learning_rate": 1.4317112503391432e-08,
"loss": 0.0,
"reward": 2.202411949634552,
"reward_std": 2.396846577525139,
"rewards/concensus_correctness_reward_func": 0.6762499995529652,
"rewards/consensus_reward_func": 0.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.5486306510865688,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.38378125987946987,
"step": 180
},
{
"completion_length": 358.25,
"epoch": 15.166666666666666,
"grad_norm": 0.44812634587287903,
"kl": 0.006337179249385372,
"learning_rate": 1.174044726278478e-08,
"loss": 0.0,
"reward": 1.5023312643170357,
"reward_std": 0.868278194218874,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5742063000798225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.365625005797483,
"step": 182
},
{
"completion_length": 490.96875,
"epoch": 15.333333333333334,
"grad_norm": 0.4610471725463867,
"kl": 0.004114439827390015,
"learning_rate": 9.413683391492455e-09,
"loss": 0.0,
"reward": 0.9374719671905041,
"reward_std": 1.0686773546040058,
"rewards/concensus_correctness_reward_func": 0.04312499985098839,
"rewards/consensus_reward_func": 0.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6183782331645489,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.07284376211464405,
"step": 184
},
{
"completion_length": 494.375,
"epoch": 15.5,
"grad_norm": 0.4306413233280182,
"kl": 0.005119932699017227,
"learning_rate": 7.339261343510206e-09,
"loss": 0.0,
"reward": 2.5885951071977615,
"reward_std": 2.4889354780316353,
"rewards/concensus_correctness_reward_func": 1.2648750003427267,
"rewards/consensus_reward_func": 0.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.5245639234781265,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.15853123925626278,
"step": 186
},
{
"completion_length": 517.625,
"epoch": 15.666666666666666,
"grad_norm": 0.5044927597045898,
"kl": 0.003461526444880292,
"learning_rate": 5.519356901107358e-09,
"loss": 0.0,
"reward": 0.6680111261084676,
"reward_std": 1.0589893124997616,
"rewards/concensus_correctness_reward_func": 0.0403750017285347,
"rewards/consensus_reward_func": 0.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.5050736283883452,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": -0.15868749096989632,
"step": 188
},
{
"completion_length": 502.28125,
"epoch": 15.833333333333334,
"grad_norm": 0.4112931489944458,
"kl": 0.004242730807163753,
"learning_rate": 3.95587889273144e-09,
"loss": 0.0,
"reward": 1.0356206335127354,
"reward_std": 1.2721525505185127,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5705581326037645,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.15256250742822886,
"step": 190
},
{
"completion_length": 447.75,
"epoch": 16.0,
"grad_norm": 0.5173096060752869,
"kl": 0.0055172459105961025,
"learning_rate": 2.6504671909109988e-09,
"loss": 0.0,
"reward": 1.2947994247078896,
"reward_std": 0.9049638472497463,
"rewards/concensus_correctness_reward_func": 0.010999999940395355,
"rewards/consensus_reward_func": 0.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.41786191053688526,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.22531251050531864,
"step": 192
},
{
"completion_length": 486.375,
"epoch": 16.166666666666668,
"grad_norm": 0.47947174310684204,
"kl": 0.004277107727830298,
"learning_rate": 1.6044909922555972e-09,
"loss": 0.0,
"reward": 0.6717201061546803,
"reward_std": 0.9776397682726383,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.4134075944311917,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.05518750101327896,
"step": 194
},
{
"completion_length": 426.0,
"epoch": 16.333333333333332,
"grad_norm": 0.6205836534500122,
"kl": 0.0041899179050233215,
"learning_rate": 8.19047381357657e-10,
"loss": 0.0,
"reward": 1.1252982020378113,
"reward_std": 0.8293185122311115,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6142044588923454,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.10484375571832061,
"step": 196
},
{
"completion_length": 475.28125,
"epoch": 16.5,
"grad_norm": 0.4393874406814575,
"kl": 0.004755317640956491,
"learning_rate": 2.949601801023327e-10,
"loss": 0.0,
"reward": 2.9790390357375145,
"reward_std": 2.1492477003484964,
"rewards/concensus_correctness_reward_func": 1.8859999999403954,
"rewards/consensus_reward_func": 0.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.6341639496386051,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.021374999545514584,
"step": 198
},
{
"completion_length": 396.65625,
"epoch": 16.666666666666668,
"grad_norm": 0.9207046031951904,
"kl": 0.005724833768908866,
"learning_rate": 3.2779083591949474e-11,
"loss": 0.0,
"reward": 1.5880182441323996,
"reward_std": 1.0737088397145271,
"rewards/concensus_correctness_reward_func": 0.0403750017285347,
"rewards/consensus_reward_func": 0.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.5196432434022427,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.34050000831484795,
"step": 200
},
{
"epoch": 16.666666666666668,
"step": 200,
"total_flos": 0.0,
"train_loss": 4.575966534048348e-06,
"train_runtime": 9569.7045,
"train_samples_per_second": 0.334,
"train_steps_per_second": 0.021
}
],
"logging_steps": 2,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 17,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}