wyceee's picture
End of training
a95ba00 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.571428571428571,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 326.21875,
"epoch": 0.11428571428571428,
"grad_norm": 2.7555673122406006,
"kl": 0.0,
"learning_rate": 1.6666666666666665e-07,
"loss": -0.0,
"reward": 3.2572884149849415,
"reward_std": 2.711772508919239,
"rewards/concensus_correctness_reward_func": 1.3658749970927602,
"rewards/consensus_reward_func": 0.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.676913361530751,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.48012501280754805,
"step": 2
},
{
"completion_length": 223.25,
"epoch": 0.22857142857142856,
"grad_norm": 2.6882662773132324,
"kl": 0.0006823752573836828,
"learning_rate": 5e-07,
"loss": 0.0,
"reward": 6.5029780976474285,
"reward_std": 3.8371810587123036,
"rewards/concensus_correctness_reward_func": 3.9408124699257314,
"rewards/consensus_reward_func": 0.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.4921030206605792,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.7575625074096024,
"step": 4
},
{
"completion_length": 327.5625,
"epoch": 0.34285714285714286,
"grad_norm": 2.6698217391967773,
"kl": 0.0007466921615559841,
"learning_rate": 4.994757065594279e-07,
"loss": 0.0,
"reward": 3.1852196622639894,
"reward_std": 1.617547769099474,
"rewards/concensus_correctness_reward_func": 0.972874996252358,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.6452508568763733,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.7077187523245811,
"step": 6
},
{
"completion_length": 300.34375,
"epoch": 0.45714285714285713,
"grad_norm": 2.2753820419311523,
"kl": 0.0009206273971358314,
"learning_rate": 4.979050253066063e-07,
"loss": 0.0,
"reward": 4.944724701344967,
"reward_std": 4.053568044560961,
"rewards/concensus_correctness_reward_func": 2.3233749866485596,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.875,
"rewards/question_recreation_reward_func": 0.6202871967107058,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.6416875068098307,
"step": 8
},
{
"completion_length": 251.34375,
"epoch": 0.5714285714285714,
"grad_norm": 3.361114263534546,
"kl": 0.001550516844872618,
"learning_rate": 4.952945442245597e-07,
"loss": 0.0,
"reward": 4.728278212249279,
"reward_std": 4.151839345460758,
"rewards/concensus_correctness_reward_func": 2.2346875024959445,
"rewards/consensus_reward_func": 0.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.5102781374007463,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.6864375146105886,
"step": 10
},
{
"completion_length": 283.0625,
"epoch": 0.6857142857142857,
"grad_norm": 2.796189308166504,
"kl": 0.0018549648048065137,
"learning_rate": 4.916552125781528e-07,
"loss": 0.0,
"reward": 5.428475089371204,
"reward_std": 2.1235571010038257,
"rewards/concensus_correctness_reward_func": 2.752250012010336,
"rewards/consensus_reward_func": 0.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.75,
"rewards/question_recreation_reward_func": 0.5671625286340714,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.6871875002980232,
"step": 12
},
{
"completion_length": 292.625,
"epoch": 0.8,
"grad_norm": 2.12504243850708,
"kl": 0.002416994764644187,
"learning_rate": 4.870022949890676e-07,
"loss": 0.0,
"reward": 3.0031582564115524,
"reward_std": 1.936399682686897,
"rewards/concensus_correctness_reward_func": 0.9825624963268638,
"rewards/consensus_reward_func": 0.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.52943952428177,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.7880312651395798,
"step": 14
},
{
"completion_length": 236.5,
"epoch": 0.9142857142857143,
"grad_norm": 3.788289785385132,
"kl": 0.004197390335320961,
"learning_rate": 4.81355307410676e-07,
"loss": 0.0,
"reward": 4.460998922586441,
"reward_std": 3.673946577589959,
"rewards/concensus_correctness_reward_func": 2.212312502786517,
"rewards/consensus_reward_func": 0.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.5744363954290748,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.6742500034160912,
"step": 16
},
{
"completion_length": 223.5,
"epoch": 1.0,
"grad_norm": 2.340423345565796,
"kl": 0.00462426839900824,
"learning_rate": 4.747379352713488e-07,
"loss": 0.0,
"reward": 5.577042788267136,
"reward_std": 2.891117551790861,
"rewards/concensus_correctness_reward_func": 3.5999166841308274,
"rewards/consensus_reward_func": 0.3333333333333333,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3333333333333333,
"rewards/question_recreation_reward_func": 0.5223344924549261,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.10416666666666667,
"rewards/xmlcount_reward_func": 0.6839583379526933,
"step": 18
},
{
"completion_length": 285.375,
"epoch": 1.1142857142857143,
"grad_norm": 2.9781136512756348,
"kl": 0.005876571987755597,
"learning_rate": 4.6717793412953776e-07,
"loss": 0.0,
"reward": 3.856148846447468,
"reward_std": 3.056236045435071,
"rewards/concensus_correctness_reward_func": 1.7554374812170863,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.5816801311448216,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.6127812552731484,
"step": 20
},
{
"completion_length": 258.78125,
"epoch": 1.2285714285714286,
"grad_norm": 2.790374279022217,
"kl": 0.008101415849523619,
"learning_rate": 4.5870701325731773e-07,
"loss": 0.0,
"reward": 5.01688564568758,
"reward_std": 2.427376964595169,
"rewards/concensus_correctness_reward_func": 2.506874994840473,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.875,
"rewards/question_recreation_reward_func": 0.5136668155901134,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.6682187579572201,
"step": 22
},
{
"completion_length": 240.25,
"epoch": 1.342857142857143,
"grad_norm": 2.5987563133239746,
"kl": 0.012570352992042899,
"learning_rate": 4.4936070264068016e-07,
"loss": 0.0,
"reward": 4.539519101381302,
"reward_std": 2.6822728496044874,
"rewards/concensus_correctness_reward_func": 1.7519375048577785,
"rewards/consensus_reward_func": 0.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.5148315682308748,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.7258749920874834,
"step": 24
},
{
"completion_length": 297.03125,
"epoch": 1.457142857142857,
"grad_norm": 2.034766912460327,
"kl": 0.010542554169660434,
"learning_rate": 4.391782039544238e-07,
"loss": 0.0,
"reward": 5.3825334794819355,
"reward_std": 4.827194595243782,
"rewards/concensus_correctness_reward_func": 2.6841250059515005,
"rewards/consensus_reward_func": 0.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.7439396986737847,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.6419687559828162,
"step": 26
},
{
"completion_length": 282.1875,
"epoch": 1.5714285714285714,
"grad_norm": 4.419146537780762,
"kl": 0.010264099051710218,
"learning_rate": 4.282022261367073e-07,
"loss": 0.0,
"reward": 2.5538329035043716,
"reward_std": 0.86597695434466,
"rewards/concensus_correctness_reward_func": 0.33731249440461397,
"rewards/consensus_reward_func": 0.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.6221455032937229,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.6099999930593185,
"step": 28
},
{
"completion_length": 285.875,
"epoch": 1.6857142857142857,
"grad_norm": 2.3088910579681396,
"kl": 0.014536559290718287,
"learning_rate": 4.1647880625292027e-07,
"loss": 0.0,
"reward": 7.595606815069914,
"reward_std": 2.5936438450589776,
"rewards/concensus_correctness_reward_func": 4.180062495172024,
"rewards/consensus_reward_func": 1.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.7352005220018327,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.6959687564522028,
"step": 30
},
{
"completion_length": 260.65625,
"epoch": 1.8,
"grad_norm": 2.6501407623291016,
"kl": 0.018609989958349615,
"learning_rate": 4.040571164002318e-07,
"loss": 0.0,
"reward": 4.827333331108093,
"reward_std": 2.085965577978641,
"rewards/concensus_correctness_reward_func": 2.4736250173300505,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.5945833660662174,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.6028750017285347,
"step": 32
},
{
"completion_length": 262.71875,
"epoch": 1.9142857142857141,
"grad_norm": 3.4099960327148438,
"kl": 0.018288226914592087,
"learning_rate": 3.909892574627266e-07,
"loss": 0.0,
"reward": 5.539210200309753,
"reward_std": 2.2049794927006587,
"rewards/concensus_correctness_reward_func": 2.6087499796412885,
"rewards/consensus_reward_func": 0.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.6833352446556091,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.8096250100061297,
"step": 34
},
{
"completion_length": 315.9166666666667,
"epoch": 2.0,
"grad_norm": 1.4127824306488037,
"kl": 0.0216289390809834,
"learning_rate": 3.773300405821908e-07,
"loss": 0.0,
"reward": 3.2096741100152335,
"reward_std": 1.8589469492435455,
"rewards/concensus_correctness_reward_func": 0.7585000023245811,
"rewards/consensus_reward_func": 0.4166666666666667,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6666666666666666,
"rewards/question_recreation_reward_func": 0.6264241177899142,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.6789166710029045,
"step": 36
},
{
"completion_length": 235.25,
"epoch": 2.1142857142857143,
"grad_norm": 2.856210470199585,
"kl": 0.03234067652374506,
"learning_rate": 3.6313675726113475e-07,
"loss": 0.0,
"reward": 4.8823426477611065,
"reward_std": 4.0293696410954,
"rewards/concensus_correctness_reward_func": 2.1174999997019768,
"rewards/consensus_reward_func": 0.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.6155301326652989,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.8055625003762543,
"step": 38
},
{
"completion_length": 289.0,
"epoch": 2.2285714285714286,
"grad_norm": 2.498208999633789,
"kl": 0.032537119346670806,
"learning_rate": 3.484689390623218e-07,
"loss": 0.0,
"reward": 3.257258500903845,
"reward_std": 1.8479195050895214,
"rewards/concensus_correctness_reward_func": 0.9986874996393453,
"rewards/consensus_reward_func": 0.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.6424147803336382,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.5849062511697412,
"step": 40
},
{
"completion_length": 286.0,
"epoch": 2.342857142857143,
"grad_norm": 3.0837841033935547,
"kl": 0.03241805831203237,
"learning_rate": 3.3338810791270517e-07,
"loss": 0.0,
"reward": 6.6657252591103315,
"reward_std": 5.738411407917738,
"rewards/concensus_correctness_reward_func": 4.001499989069998,
"rewards/consensus_reward_func": 0.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.75,
"rewards/question_recreation_reward_func": 0.6340690106153488,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.6551562692038715,
"step": 42
},
{
"completion_length": 240.5,
"epoch": 2.4571428571428573,
"grad_norm": 2.2751593589782715,
"kl": 0.027716133918147534,
"learning_rate": 3.179575180590857e-07,
"loss": 0.0,
"reward": 3.8581665493547916,
"reward_std": 1.8522115424275398,
"rewards/concensus_correctness_reward_func": 1.1526875039562583,
"rewards/consensus_reward_func": 0.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.5496665136888623,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.9839375028386712,
"step": 44
},
{
"completion_length": 287.8125,
"epoch": 2.571428571428571,
"grad_norm": 2.325424909591675,
"kl": 0.03136032959446311,
"learning_rate": 3.022418907578188e-07,
"loss": 0.0,
"reward": 5.811520978808403,
"reward_std": 2.1761377695947886,
"rewards/concensus_correctness_reward_func": 3.5759375113993883,
"rewards/consensus_reward_func": 0.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.5898647699505091,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.6925937533378601,
"step": 46
},
{
"completion_length": 264.09375,
"epoch": 2.685714285714286,
"grad_norm": 2.799055814743042,
"kl": 0.035045830823946744,
"learning_rate": 2.863071428113726e-07,
"loss": 0.0,
"reward": 5.103824369609356,
"reward_std": 3.5301670129410923,
"rewards/concensus_correctness_reward_func": 2.3825000133365393,
"rewards/consensus_reward_func": 0.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.5504180546849966,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.6865312550216913,
"step": 48
},
{
"completion_length": 280.375,
"epoch": 2.8,
"grad_norm": 2.319396495819092,
"kl": 0.033160059072542936,
"learning_rate": 2.7022011009035107e-07,
"loss": 0.0,
"reward": 4.136773347854614,
"reward_std": 1.7412771796807647,
"rewards/concensus_correctness_reward_func": 1.2328750090673566,
"rewards/consensus_reward_func": 0.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.7363045308738947,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.6988437669351697,
"step": 50
},
{
"completion_length": 252.5,
"epoch": 2.914285714285714,
"grad_norm": 2.2659144401550293,
"kl": 0.08391272573499009,
"learning_rate": 2.540482672006254e-07,
"loss": 0.0001,
"reward": 5.248130708932877,
"reward_std": 2.9013717267662287,
"rewards/concensus_correctness_reward_func": 2.086687508970499,
"rewards/consensus_reward_func": 0.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.75,
"rewards/question_recreation_reward_func": 0.6195994764566422,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.7918437521439046,
"step": 52
},
{
"completion_length": 336.3333333333333,
"epoch": 3.0,
"grad_norm": 1.9638556241989136,
"kl": 0.0329820365489771,
"learning_rate": 2.37859444471388e-07,
"loss": 0.0,
"reward": 2.897741069396337,
"reward_std": 1.3238216874500115,
"rewards/concensus_correctness_reward_func": 0.4622500070060293,
"rewards/consensus_reward_func": 0.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5833333333333334,
"rewards/question_recreation_reward_func": 0.7555743406216303,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5965833238636454,
"step": 54
},
{
"completion_length": 297.90625,
"epoch": 3.1142857142857143,
"grad_norm": 2.512716770172119,
"kl": 0.07957816089037806,
"learning_rate": 2.2172154345117894e-07,
"loss": 0.0001,
"reward": 5.407194800674915,
"reward_std": 2.5112848294666037,
"rewards/concensus_correctness_reward_func": 2.6179999876767397,
"rewards/consensus_reward_func": 1.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.6985698798671365,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.3562500039115548,
"step": 56
},
{
"completion_length": 261.375,
"epoch": 3.2285714285714286,
"grad_norm": 2.727494716644287,
"kl": 0.051797536259982735,
"learning_rate": 2.0570225210519433e-07,
"loss": 0.0001,
"reward": 4.303291346877813,
"reward_std": 3.120794242247939,
"rewards/concensus_correctness_reward_func": 1.5453749848529696,
"rewards/consensus_reward_func": 0.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.5874788034707308,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.6548125031404197,
"step": 58
},
{
"completion_length": 312.5,
"epoch": 3.342857142857143,
"grad_norm": 2.5867483615875244,
"kl": 0.045177310064900666,
"learning_rate": 1.8986876090843664e-07,
"loss": 0.0,
"reward": 6.679476020857692,
"reward_std": 6.590652715298347,
"rewards/concensus_correctness_reward_func": 3.911812473088503,
"rewards/consensus_reward_func": 0.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.6978510078042746,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5698124994523823,
"step": 60
},
{
"completion_length": 275.46875,
"epoch": 3.4571428571428573,
"grad_norm": 2.2337958812713623,
"kl": 0.05546386865898967,
"learning_rate": 1.7428748102551234e-07,
"loss": 0.0001,
"reward": 4.224702462553978,
"reward_std": 2.568043567443965,
"rewards/concensus_correctness_reward_func": 1.6509375016321428,
"rewards/consensus_reward_func": 0.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.5963900072965771,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.8680000007152557,
"step": 62
},
{
"completion_length": 254.375,
"epoch": 3.571428571428571,
"grad_norm": 2.5836949348449707,
"kl": 0.06771399604622275,
"learning_rate": 1.5902376575912814e-07,
"loss": 0.0001,
"reward": 5.309353556483984,
"reward_std": 2.0958344470709562,
"rewards/concensus_correctness_reward_func": 2.4754375047050416,
"rewards/consensus_reward_func": 0.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.5066348570398986,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.7335312599316239,
"step": 64
},
{
"completion_length": 294.90625,
"epoch": 3.685714285714286,
"grad_norm": 2.2413110733032227,
"kl": 0.058190350187942386,
"learning_rate": 1.4414163643562753e-07,
"loss": 0.0001,
"reward": 4.059148486703634,
"reward_std": 2.2921111752657453,
"rewards/concensus_correctness_reward_func": 1.5227500088512897,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.7638359684497118,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.7569375038146973,
"step": 66
},
{
"completion_length": 277.53125,
"epoch": 3.8,
"grad_norm": 2.7358996868133545,
"kl": 0.07721107231918722,
"learning_rate": 1.2970351387729872e-07,
"loss": 0.0001,
"reward": 3.4795289039611816,
"reward_std": 2.1741816513240337,
"rewards/concensus_correctness_reward_func": 0.8386874985590111,
"rewards/consensus_reward_func": 0.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.6809976994991302,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.8817187771201134,
"step": 68
},
{
"completion_length": 256.0625,
"epoch": 3.914285714285714,
"grad_norm": 4.087591648101807,
"kl": 0.0788359681610018,
"learning_rate": 1.1576995658775404e-07,
"loss": 0.0001,
"reward": 6.083272695541382,
"reward_std": 4.112470694584772,
"rewards/concensus_correctness_reward_func": 3.4234374810475856,
"rewards/consensus_reward_func": 0.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.5136476922780275,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.8336875168606639,
"step": 70
},
{
"completion_length": 276.5,
"epoch": 4.0,
"grad_norm": 1.627131700515747,
"kl": 0.11795352476959427,
"learning_rate": 1.0239940674851941e-07,
"loss": 0.0001,
"reward": 5.3608784476916,
"reward_std": 2.300887676576773,
"rewards/concensus_correctness_reward_func": 1.9943333491683006,
"rewards/consensus_reward_func": 1.0833333333333333,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.75,
"rewards/question_recreation_reward_func": 0.6572117364654938,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.08333333333333333,
"rewards/xmlcount_reward_func": 0.7926666811108589,
"step": 72
},
{
"completion_length": 353.4375,
"epoch": 4.114285714285714,
"grad_norm": 4.412367820739746,
"kl": 0.08782886900007725,
"learning_rate": 8.964794509221507e-08,
"loss": 0.0001,
"reward": 4.319345578551292,
"reward_std": 1.4791212249547243,
"rewards/concensus_correctness_reward_func": 1.8143750003073364,
"rewards/consensus_reward_func": 0.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.6734706219285727,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.706500010099262,
"step": 74
},
{
"completion_length": 298.5,
"epoch": 4.228571428571429,
"grad_norm": 43.921119689941406,
"kl": 0.09398894105106592,
"learning_rate": 7.756905568047392e-08,
"loss": 0.0001,
"reward": 5.068341612815857,
"reward_std": 2.5960501823574305,
"rewards/concensus_correctness_reward_func": 2.2858750016748672,
"rewards/consensus_reward_func": 0.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.6213102764450014,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.6142812587786466,
"step": 76
},
{
"completion_length": 279.4375,
"epoch": 4.3428571428571425,
"grad_norm": 3.3324248790740967,
"kl": 0.12617466738447547,
"learning_rate": 6.621340157319996e-08,
"loss": 0.0001,
"reward": 6.855641521513462,
"reward_std": 5.589063869789243,
"rewards/concensus_correctness_reward_func": 3.5954999728128314,
"rewards/consensus_reward_func": 1.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.6044852556660771,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.7650312539190054,
"step": 78
},
{
"completion_length": 282.25,
"epoch": 4.457142857142857,
"grad_norm": 2.391388416290283,
"kl": 0.0781217070762068,
"learning_rate": 5.5628612330087724e-08,
"loss": 0.0001,
"reward": 5.143940486013889,
"reward_std": 3.1008094910066575,
"rewards/concensus_correctness_reward_func": 2.1685625007376075,
"rewards/consensus_reward_func": 0.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.5825029462575912,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.7835000064224005,
"step": 80
},
{
"completion_length": 250.875,
"epoch": 4.571428571428571,
"grad_norm": 3.275106906890869,
"kl": 0.08860545780044049,
"learning_rate": 4.5859084235697235e-08,
"loss": 0.0001,
"reward": 3.176556244492531,
"reward_std": 1.5254050176981764,
"rewards/concensus_correctness_reward_func": 0.8501874984940514,
"rewards/consensus_reward_func": 0.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.5377749832696281,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.9917187504470348,
"step": 82
},
{
"completion_length": 252.65625,
"epoch": 4.685714285714286,
"grad_norm": 5.577023029327393,
"kl": 0.14010742434766144,
"learning_rate": 3.6945794086007705e-08,
"loss": 0.0001,
"reward": 4.1389394868165255,
"reward_std": 2.314908188767731,
"rewards/concensus_correctness_reward_func": 1.5294374911900377,
"rewards/consensus_reward_func": 0.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.560220692306757,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.6586562575539574,
"step": 84
},
{
"completion_length": 257.34375,
"epoch": 4.8,
"grad_norm": 2.5406899452209473,
"kl": 0.09263441083021462,
"learning_rate": 2.892612731749414e-08,
"loss": 0.0001,
"reward": 3.948366153985262,
"reward_std": 1.6628333161497721,
"rewards/concensus_correctness_reward_func": 1.1678125290200114,
"rewards/consensus_reward_func": 0.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.6303973635658622,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.8064062613993883,
"step": 86
},
{
"completion_length": 266.59375,
"epoch": 4.914285714285715,
"grad_norm": 2.87119197845459,
"kl": 0.11624281201511621,
"learning_rate": 2.183372119961499e-08,
"loss": 0.0001,
"reward": 4.197016902267933,
"reward_std": 2.4855765970423818,
"rewards/concensus_correctness_reward_func": 1.5803749952465296,
"rewards/consensus_reward_func": 0.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.6657043690793216,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.9353125058114529,
"step": 88
},
{
"completion_length": 259.5833333333333,
"epoch": 5.0,
"grad_norm": 3.3491158485412598,
"kl": 0.16860986345758042,
"learning_rate": 1.5698323748414122e-08,
"loss": 0.0001,
"reward": 7.5917567908763885,
"reward_std": 5.551452632372578,
"rewards/concensus_correctness_reward_func": 4.24174995906651,
"rewards/consensus_reward_func": 0.8333333333333334,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8333333333333334,
"rewards/question_recreation_reward_func": 0.7116317736605803,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.041666666666666664,
"rewards/xmlcount_reward_func": 0.9300416857004166,
"step": 90
},
{
"completion_length": 259.78125,
"epoch": 5.114285714285714,
"grad_norm": 4.982123851776123,
"kl": 0.2259263969026506,
"learning_rate": 1.054566895300324e-08,
"loss": 0.0002,
"reward": 4.731869850307703,
"reward_std": 3.6210765979485586,
"rewards/concensus_correctness_reward_func": 1.8394375070929527,
"rewards/consensus_reward_func": 0.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.6873698411509395,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.7831875099800527,
"step": 92
},
{
"completion_length": 309.03125,
"epoch": 5.228571428571429,
"grad_norm": 2.2618565559387207,
"kl": 0.2819003712502308,
"learning_rate": 6.397368838268496e-09,
"loss": 0.0003,
"reward": 3.303976181894541,
"reward_std": 2.2868103915825486,
"rewards/concensus_correctness_reward_func": 0.8359999973326921,
"rewards/consensus_reward_func": 0.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.6375387134030461,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.6741875065490603,
"step": 94
},
{
"completion_length": 263.03125,
"epoch": 5.3428571428571425,
"grad_norm": 2.844130039215088,
"kl": 0.059587346273474395,
"learning_rate": 3.2708228165273244e-09,
"loss": 0.0001,
"reward": 5.61201386898756,
"reward_std": 4.463006908656098,
"rewards/concensus_correctness_reward_func": 2.957062483765185,
"rewards/consensus_reward_func": 0.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.5966701377183199,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.7145312689244747,
"step": 96
},
{
"completion_length": 250.0625,
"epoch": 5.457142857142857,
"grad_norm": 2.9318857192993164,
"kl": 0.06650862575042993,
"learning_rate": 1.1791447083465133e-09,
"loss": 0.0001,
"reward": 5.33496169000864,
"reward_std": 1.7955414667958394,
"rewards/concensus_correctness_reward_func": 2.302812503403402,
"rewards/consensus_reward_func": 0.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.875,
"rewards/question_recreation_reward_func": 0.6075867600739002,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.8620625035837293,
"step": 98
},
{
"completion_length": 270.4375,
"epoch": 5.571428571428571,
"grad_norm": 2.9673755168914795,
"kl": 0.08620016509667039,
"learning_rate": 1.3110773862126667e-10,
"loss": 0.0001,
"reward": 4.476045485585928,
"reward_std": 3.1387285026721656,
"rewards/concensus_correctness_reward_func": 1.7854374921880662,
"rewards/consensus_reward_func": 0.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.606795561965555,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.9119375087320805,
"step": 100
},
{
"epoch": 5.571428571428571,
"step": 100,
"total_flos": 0.0,
"train_loss": 5.454165089759044e-05,
"train_runtime": 1795.073,
"train_samples_per_second": 0.891,
"train_steps_per_second": 0.056
}
],
"logging_steps": 2,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}