|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.571428571428571, |
|
"eval_steps": 500, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 326.21875, |
|
"epoch": 0.11428571428571428, |
|
"grad_norm": 2.7555673122406006, |
|
"kl": 0.0, |
|
"learning_rate": 1.6666666666666665e-07, |
|
"loss": -0.0, |
|
"reward": 3.2572884149849415, |
|
"reward_std": 2.711772508919239, |
|
"rewards/concensus_correctness_reward_func": 1.3658749970927602, |
|
"rewards/consensus_reward_func": 0.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.676913361530751, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.48012501280754805, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 223.25, |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 2.6882662773132324, |
|
"kl": 0.0006823752573836828, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0, |
|
"reward": 6.5029780976474285, |
|
"reward_std": 3.8371810587123036, |
|
"rewards/concensus_correctness_reward_func": 3.9408124699257314, |
|
"rewards/consensus_reward_func": 0.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.4921030206605792, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.7575625074096024, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 327.5625, |
|
"epoch": 0.34285714285714286, |
|
"grad_norm": 2.6698217391967773, |
|
"kl": 0.0007466921615559841, |
|
"learning_rate": 4.994757065594279e-07, |
|
"loss": 0.0, |
|
"reward": 3.1852196622639894, |
|
"reward_std": 1.617547769099474, |
|
"rewards/concensus_correctness_reward_func": 0.972874996252358, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.6452508568763733, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.7077187523245811, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 300.34375, |
|
"epoch": 0.45714285714285713, |
|
"grad_norm": 2.2753820419311523, |
|
"kl": 0.0009206273971358314, |
|
"learning_rate": 4.979050253066063e-07, |
|
"loss": 0.0, |
|
"reward": 4.944724701344967, |
|
"reward_std": 4.053568044560961, |
|
"rewards/concensus_correctness_reward_func": 2.3233749866485596, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.875, |
|
"rewards/question_recreation_reward_func": 0.6202871967107058, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.6416875068098307, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 251.34375, |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 3.361114263534546, |
|
"kl": 0.001550516844872618, |
|
"learning_rate": 4.952945442245597e-07, |
|
"loss": 0.0, |
|
"reward": 4.728278212249279, |
|
"reward_std": 4.151839345460758, |
|
"rewards/concensus_correctness_reward_func": 2.2346875024959445, |
|
"rewards/consensus_reward_func": 0.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.5102781374007463, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.6864375146105886, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 283.0625, |
|
"epoch": 0.6857142857142857, |
|
"grad_norm": 2.796189308166504, |
|
"kl": 0.0018549648048065137, |
|
"learning_rate": 4.916552125781528e-07, |
|
"loss": 0.0, |
|
"reward": 5.428475089371204, |
|
"reward_std": 2.1235571010038257, |
|
"rewards/concensus_correctness_reward_func": 2.752250012010336, |
|
"rewards/consensus_reward_func": 0.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.75, |
|
"rewards/question_recreation_reward_func": 0.5671625286340714, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.6871875002980232, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 292.625, |
|
"epoch": 0.8, |
|
"grad_norm": 2.12504243850708, |
|
"kl": 0.002416994764644187, |
|
"learning_rate": 4.870022949890676e-07, |
|
"loss": 0.0, |
|
"reward": 3.0031582564115524, |
|
"reward_std": 1.936399682686897, |
|
"rewards/concensus_correctness_reward_func": 0.9825624963268638, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.52943952428177, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.7880312651395798, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 236.5, |
|
"epoch": 0.9142857142857143, |
|
"grad_norm": 3.788289785385132, |
|
"kl": 0.004197390335320961, |
|
"learning_rate": 4.81355307410676e-07, |
|
"loss": 0.0, |
|
"reward": 4.460998922586441, |
|
"reward_std": 3.673946577589959, |
|
"rewards/concensus_correctness_reward_func": 2.212312502786517, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4375, |
|
"rewards/question_recreation_reward_func": 0.5744363954290748, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.6742500034160912, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 223.5, |
|
"epoch": 1.0, |
|
"grad_norm": 2.340423345565796, |
|
"kl": 0.00462426839900824, |
|
"learning_rate": 4.747379352713488e-07, |
|
"loss": 0.0, |
|
"reward": 5.577042788267136, |
|
"reward_std": 2.891117551790861, |
|
"rewards/concensus_correctness_reward_func": 3.5999166841308274, |
|
"rewards/consensus_reward_func": 0.3333333333333333, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3333333333333333, |
|
"rewards/question_recreation_reward_func": 0.5223344924549261, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.10416666666666667, |
|
"rewards/xmlcount_reward_func": 0.6839583379526933, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 285.375, |
|
"epoch": 1.1142857142857143, |
|
"grad_norm": 2.9781136512756348, |
|
"kl": 0.005876571987755597, |
|
"learning_rate": 4.6717793412953776e-07, |
|
"loss": 0.0, |
|
"reward": 3.856148846447468, |
|
"reward_std": 3.056236045435071, |
|
"rewards/concensus_correctness_reward_func": 1.7554374812170863, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4375, |
|
"rewards/question_recreation_reward_func": 0.5816801311448216, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.6127812552731484, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 258.78125, |
|
"epoch": 1.2285714285714286, |
|
"grad_norm": 2.790374279022217, |
|
"kl": 0.008101415849523619, |
|
"learning_rate": 4.5870701325731773e-07, |
|
"loss": 0.0, |
|
"reward": 5.01688564568758, |
|
"reward_std": 2.427376964595169, |
|
"rewards/concensus_correctness_reward_func": 2.506874994840473, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.875, |
|
"rewards/question_recreation_reward_func": 0.5136668155901134, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.6682187579572201, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 240.25, |
|
"epoch": 1.342857142857143, |
|
"grad_norm": 2.5987563133239746, |
|
"kl": 0.012570352992042899, |
|
"learning_rate": 4.4936070264068016e-07, |
|
"loss": 0.0, |
|
"reward": 4.539519101381302, |
|
"reward_std": 2.6822728496044874, |
|
"rewards/concensus_correctness_reward_func": 1.7519375048577785, |
|
"rewards/consensus_reward_func": 0.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.5148315682308748, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.7258749920874834, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 297.03125, |
|
"epoch": 1.457142857142857, |
|
"grad_norm": 2.034766912460327, |
|
"kl": 0.010542554169660434, |
|
"learning_rate": 4.391782039544238e-07, |
|
"loss": 0.0, |
|
"reward": 5.3825334794819355, |
|
"reward_std": 4.827194595243782, |
|
"rewards/concensus_correctness_reward_func": 2.6841250059515005, |
|
"rewards/consensus_reward_func": 0.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.7439396986737847, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.6419687559828162, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 282.1875, |
|
"epoch": 1.5714285714285714, |
|
"grad_norm": 4.419146537780762, |
|
"kl": 0.010264099051710218, |
|
"learning_rate": 4.282022261367073e-07, |
|
"loss": 0.0, |
|
"reward": 2.5538329035043716, |
|
"reward_std": 0.86597695434466, |
|
"rewards/concensus_correctness_reward_func": 0.33731249440461397, |
|
"rewards/consensus_reward_func": 0.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.6221455032937229, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.6099999930593185, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 285.875, |
|
"epoch": 1.6857142857142857, |
|
"grad_norm": 2.3088910579681396, |
|
"kl": 0.014536559290718287, |
|
"learning_rate": 4.1647880625292027e-07, |
|
"loss": 0.0, |
|
"reward": 7.595606815069914, |
|
"reward_std": 2.5936438450589776, |
|
"rewards/concensus_correctness_reward_func": 4.180062495172024, |
|
"rewards/consensus_reward_func": 1.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.7352005220018327, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.6959687564522028, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 260.65625, |
|
"epoch": 1.8, |
|
"grad_norm": 2.6501407623291016, |
|
"kl": 0.018609989958349615, |
|
"learning_rate": 4.040571164002318e-07, |
|
"loss": 0.0, |
|
"reward": 4.827333331108093, |
|
"reward_std": 2.085965577978641, |
|
"rewards/concensus_correctness_reward_func": 2.4736250173300505, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.5945833660662174, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.6028750017285347, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 262.71875, |
|
"epoch": 1.9142857142857141, |
|
"grad_norm": 3.4099960327148438, |
|
"kl": 0.018288226914592087, |
|
"learning_rate": 3.909892574627266e-07, |
|
"loss": 0.0, |
|
"reward": 5.539210200309753, |
|
"reward_std": 2.2049794927006587, |
|
"rewards/concensus_correctness_reward_func": 2.6087499796412885, |
|
"rewards/consensus_reward_func": 0.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.6833352446556091, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.8096250100061297, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 315.9166666666667, |
|
"epoch": 2.0, |
|
"grad_norm": 1.4127824306488037, |
|
"kl": 0.0216289390809834, |
|
"learning_rate": 3.773300405821908e-07, |
|
"loss": 0.0, |
|
"reward": 3.2096741100152335, |
|
"reward_std": 1.8589469492435455, |
|
"rewards/concensus_correctness_reward_func": 0.7585000023245811, |
|
"rewards/consensus_reward_func": 0.4166666666666667, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6666666666666666, |
|
"rewards/question_recreation_reward_func": 0.6264241177899142, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.6789166710029045, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 235.25, |
|
"epoch": 2.1142857142857143, |
|
"grad_norm": 2.856210470199585, |
|
"kl": 0.03234067652374506, |
|
"learning_rate": 3.6313675726113475e-07, |
|
"loss": 0.0, |
|
"reward": 4.8823426477611065, |
|
"reward_std": 4.0293696410954, |
|
"rewards/concensus_correctness_reward_func": 2.1174999997019768, |
|
"rewards/consensus_reward_func": 0.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.6155301326652989, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.8055625003762543, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 289.0, |
|
"epoch": 2.2285714285714286, |
|
"grad_norm": 2.498208999633789, |
|
"kl": 0.032537119346670806, |
|
"learning_rate": 3.484689390623218e-07, |
|
"loss": 0.0, |
|
"reward": 3.257258500903845, |
|
"reward_std": 1.8479195050895214, |
|
"rewards/concensus_correctness_reward_func": 0.9986874996393453, |
|
"rewards/consensus_reward_func": 0.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.6424147803336382, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.5849062511697412, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 286.0, |
|
"epoch": 2.342857142857143, |
|
"grad_norm": 3.0837841033935547, |
|
"kl": 0.03241805831203237, |
|
"learning_rate": 3.3338810791270517e-07, |
|
"loss": 0.0, |
|
"reward": 6.6657252591103315, |
|
"reward_std": 5.738411407917738, |
|
"rewards/concensus_correctness_reward_func": 4.001499989069998, |
|
"rewards/consensus_reward_func": 0.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.75, |
|
"rewards/question_recreation_reward_func": 0.6340690106153488, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.6551562692038715, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 240.5, |
|
"epoch": 2.4571428571428573, |
|
"grad_norm": 2.2751593589782715, |
|
"kl": 0.027716133918147534, |
|
"learning_rate": 3.179575180590857e-07, |
|
"loss": 0.0, |
|
"reward": 3.8581665493547916, |
|
"reward_std": 1.8522115424275398, |
|
"rewards/concensus_correctness_reward_func": 1.1526875039562583, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.5496665136888623, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.9839375028386712, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 287.8125, |
|
"epoch": 2.571428571428571, |
|
"grad_norm": 2.325424909591675, |
|
"kl": 0.03136032959446311, |
|
"learning_rate": 3.022418907578188e-07, |
|
"loss": 0.0, |
|
"reward": 5.811520978808403, |
|
"reward_std": 2.1761377695947886, |
|
"rewards/concensus_correctness_reward_func": 3.5759375113993883, |
|
"rewards/consensus_reward_func": 0.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.5898647699505091, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.6925937533378601, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 264.09375, |
|
"epoch": 2.685714285714286, |
|
"grad_norm": 2.799055814743042, |
|
"kl": 0.035045830823946744, |
|
"learning_rate": 2.863071428113726e-07, |
|
"loss": 0.0, |
|
"reward": 5.103824369609356, |
|
"reward_std": 3.5301670129410923, |
|
"rewards/concensus_correctness_reward_func": 2.3825000133365393, |
|
"rewards/consensus_reward_func": 0.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.5504180546849966, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.6865312550216913, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 280.375, |
|
"epoch": 2.8, |
|
"grad_norm": 2.319396495819092, |
|
"kl": 0.033160059072542936, |
|
"learning_rate": 2.7022011009035107e-07, |
|
"loss": 0.0, |
|
"reward": 4.136773347854614, |
|
"reward_std": 1.7412771796807647, |
|
"rewards/concensus_correctness_reward_func": 1.2328750090673566, |
|
"rewards/consensus_reward_func": 0.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.7363045308738947, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.6988437669351697, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 252.5, |
|
"epoch": 2.914285714285714, |
|
"grad_norm": 2.2659144401550293, |
|
"kl": 0.08391272573499009, |
|
"learning_rate": 2.540482672006254e-07, |
|
"loss": 0.0001, |
|
"reward": 5.248130708932877, |
|
"reward_std": 2.9013717267662287, |
|
"rewards/concensus_correctness_reward_func": 2.086687508970499, |
|
"rewards/consensus_reward_func": 0.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.75, |
|
"rewards/question_recreation_reward_func": 0.6195994764566422, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.7918437521439046, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 336.3333333333333, |
|
"epoch": 3.0, |
|
"grad_norm": 1.9638556241989136, |
|
"kl": 0.0329820365489771, |
|
"learning_rate": 2.37859444471388e-07, |
|
"loss": 0.0, |
|
"reward": 2.897741069396337, |
|
"reward_std": 1.3238216874500115, |
|
"rewards/concensus_correctness_reward_func": 0.4622500070060293, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5833333333333334, |
|
"rewards/question_recreation_reward_func": 0.7555743406216303, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5965833238636454, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 297.90625, |
|
"epoch": 3.1142857142857143, |
|
"grad_norm": 2.512716770172119, |
|
"kl": 0.07957816089037806, |
|
"learning_rate": 2.2172154345117894e-07, |
|
"loss": 0.0001, |
|
"reward": 5.407194800674915, |
|
"reward_std": 2.5112848294666037, |
|
"rewards/concensus_correctness_reward_func": 2.6179999876767397, |
|
"rewards/consensus_reward_func": 1.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.6985698798671365, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.3562500039115548, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 261.375, |
|
"epoch": 3.2285714285714286, |
|
"grad_norm": 2.727494716644287, |
|
"kl": 0.051797536259982735, |
|
"learning_rate": 2.0570225210519433e-07, |
|
"loss": 0.0001, |
|
"reward": 4.303291346877813, |
|
"reward_std": 3.120794242247939, |
|
"rewards/concensus_correctness_reward_func": 1.5453749848529696, |
|
"rewards/consensus_reward_func": 0.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.5874788034707308, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.6548125031404197, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 312.5, |
|
"epoch": 3.342857142857143, |
|
"grad_norm": 2.5867483615875244, |
|
"kl": 0.045177310064900666, |
|
"learning_rate": 1.8986876090843664e-07, |
|
"loss": 0.0, |
|
"reward": 6.679476020857692, |
|
"reward_std": 6.590652715298347, |
|
"rewards/concensus_correctness_reward_func": 3.911812473088503, |
|
"rewards/consensus_reward_func": 0.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.6978510078042746, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5698124994523823, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 275.46875, |
|
"epoch": 3.4571428571428573, |
|
"grad_norm": 2.2337958812713623, |
|
"kl": 0.05546386865898967, |
|
"learning_rate": 1.7428748102551234e-07, |
|
"loss": 0.0001, |
|
"reward": 4.224702462553978, |
|
"reward_std": 2.568043567443965, |
|
"rewards/concensus_correctness_reward_func": 1.6509375016321428, |
|
"rewards/consensus_reward_func": 0.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.5963900072965771, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.8680000007152557, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 254.375, |
|
"epoch": 3.571428571428571, |
|
"grad_norm": 2.5836949348449707, |
|
"kl": 0.06771399604622275, |
|
"learning_rate": 1.5902376575912814e-07, |
|
"loss": 0.0001, |
|
"reward": 5.309353556483984, |
|
"reward_std": 2.0958344470709562, |
|
"rewards/concensus_correctness_reward_func": 2.4754375047050416, |
|
"rewards/consensus_reward_func": 0.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.5066348570398986, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.7335312599316239, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 294.90625, |
|
"epoch": 3.685714285714286, |
|
"grad_norm": 2.2413110733032227, |
|
"kl": 0.058190350187942386, |
|
"learning_rate": 1.4414163643562753e-07, |
|
"loss": 0.0001, |
|
"reward": 4.059148486703634, |
|
"reward_std": 2.2921111752657453, |
|
"rewards/concensus_correctness_reward_func": 1.5227500088512897, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.7638359684497118, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.7569375038146973, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 277.53125, |
|
"epoch": 3.8, |
|
"grad_norm": 2.7358996868133545, |
|
"kl": 0.07721107231918722, |
|
"learning_rate": 1.2970351387729872e-07, |
|
"loss": 0.0001, |
|
"reward": 3.4795289039611816, |
|
"reward_std": 2.1741816513240337, |
|
"rewards/concensus_correctness_reward_func": 0.8386874985590111, |
|
"rewards/consensus_reward_func": 0.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.6809976994991302, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.8817187771201134, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 256.0625, |
|
"epoch": 3.914285714285714, |
|
"grad_norm": 4.087591648101807, |
|
"kl": 0.0788359681610018, |
|
"learning_rate": 1.1576995658775404e-07, |
|
"loss": 0.0001, |
|
"reward": 6.083272695541382, |
|
"reward_std": 4.112470694584772, |
|
"rewards/concensus_correctness_reward_func": 3.4234374810475856, |
|
"rewards/consensus_reward_func": 0.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.5136476922780275, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.8336875168606639, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 276.5, |
|
"epoch": 4.0, |
|
"grad_norm": 1.627131700515747, |
|
"kl": 0.11795352476959427, |
|
"learning_rate": 1.0239940674851941e-07, |
|
"loss": 0.0001, |
|
"reward": 5.3608784476916, |
|
"reward_std": 2.300887676576773, |
|
"rewards/concensus_correctness_reward_func": 1.9943333491683006, |
|
"rewards/consensus_reward_func": 1.0833333333333333, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.75, |
|
"rewards/question_recreation_reward_func": 0.6572117364654938, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.08333333333333333, |
|
"rewards/xmlcount_reward_func": 0.7926666811108589, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 353.4375, |
|
"epoch": 4.114285714285714, |
|
"grad_norm": 4.412367820739746, |
|
"kl": 0.08782886900007725, |
|
"learning_rate": 8.964794509221507e-08, |
|
"loss": 0.0001, |
|
"reward": 4.319345578551292, |
|
"reward_std": 1.4791212249547243, |
|
"rewards/concensus_correctness_reward_func": 1.8143750003073364, |
|
"rewards/consensus_reward_func": 0.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4375, |
|
"rewards/question_recreation_reward_func": 0.6734706219285727, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.706500010099262, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 298.5, |
|
"epoch": 4.228571428571429, |
|
"grad_norm": 43.921119689941406, |
|
"kl": 0.09398894105106592, |
|
"learning_rate": 7.756905568047392e-08, |
|
"loss": 0.0001, |
|
"reward": 5.068341612815857, |
|
"reward_std": 2.5960501823574305, |
|
"rewards/concensus_correctness_reward_func": 2.2858750016748672, |
|
"rewards/consensus_reward_func": 0.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.6213102764450014, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.6142812587786466, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 279.4375, |
|
"epoch": 4.3428571428571425, |
|
"grad_norm": 3.3324248790740967, |
|
"kl": 0.12617466738447547, |
|
"learning_rate": 6.621340157319996e-08, |
|
"loss": 0.0001, |
|
"reward": 6.855641521513462, |
|
"reward_std": 5.589063869789243, |
|
"rewards/concensus_correctness_reward_func": 3.5954999728128314, |
|
"rewards/consensus_reward_func": 1.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.6044852556660771, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.7650312539190054, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 282.25, |
|
"epoch": 4.457142857142857, |
|
"grad_norm": 2.391388416290283, |
|
"kl": 0.0781217070762068, |
|
"learning_rate": 5.5628612330087724e-08, |
|
"loss": 0.0001, |
|
"reward": 5.143940486013889, |
|
"reward_std": 3.1008094910066575, |
|
"rewards/concensus_correctness_reward_func": 2.1685625007376075, |
|
"rewards/consensus_reward_func": 0.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.5825029462575912, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.7835000064224005, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 250.875, |
|
"epoch": 4.571428571428571, |
|
"grad_norm": 3.275106906890869, |
|
"kl": 0.08860545780044049, |
|
"learning_rate": 4.5859084235697235e-08, |
|
"loss": 0.0001, |
|
"reward": 3.176556244492531, |
|
"reward_std": 1.5254050176981764, |
|
"rewards/concensus_correctness_reward_func": 0.8501874984940514, |
|
"rewards/consensus_reward_func": 0.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.5377749832696281, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.9917187504470348, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 252.65625, |
|
"epoch": 4.685714285714286, |
|
"grad_norm": 5.577023029327393, |
|
"kl": 0.14010742434766144, |
|
"learning_rate": 3.6945794086007705e-08, |
|
"loss": 0.0001, |
|
"reward": 4.1389394868165255, |
|
"reward_std": 2.314908188767731, |
|
"rewards/concensus_correctness_reward_func": 1.5294374911900377, |
|
"rewards/consensus_reward_func": 0.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.560220692306757, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.6586562575539574, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 257.34375, |
|
"epoch": 4.8, |
|
"grad_norm": 2.5406899452209473, |
|
"kl": 0.09263441083021462, |
|
"learning_rate": 2.892612731749414e-08, |
|
"loss": 0.0001, |
|
"reward": 3.948366153985262, |
|
"reward_std": 1.6628333161497721, |
|
"rewards/concensus_correctness_reward_func": 1.1678125290200114, |
|
"rewards/consensus_reward_func": 0.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.6303973635658622, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.8064062613993883, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 266.59375, |
|
"epoch": 4.914285714285715, |
|
"grad_norm": 2.87119197845459, |
|
"kl": 0.11624281201511621, |
|
"learning_rate": 2.183372119961499e-08, |
|
"loss": 0.0001, |
|
"reward": 4.197016902267933, |
|
"reward_std": 2.4855765970423818, |
|
"rewards/concensus_correctness_reward_func": 1.5803749952465296, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.6657043690793216, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.9353125058114529, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 259.5833333333333, |
|
"epoch": 5.0, |
|
"grad_norm": 3.3491158485412598, |
|
"kl": 0.16860986345758042, |
|
"learning_rate": 1.5698323748414122e-08, |
|
"loss": 0.0001, |
|
"reward": 7.5917567908763885, |
|
"reward_std": 5.551452632372578, |
|
"rewards/concensus_correctness_reward_func": 4.24174995906651, |
|
"rewards/consensus_reward_func": 0.8333333333333334, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8333333333333334, |
|
"rewards/question_recreation_reward_func": 0.7116317736605803, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.041666666666666664, |
|
"rewards/xmlcount_reward_func": 0.9300416857004166, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 259.78125, |
|
"epoch": 5.114285714285714, |
|
"grad_norm": 4.982123851776123, |
|
"kl": 0.2259263969026506, |
|
"learning_rate": 1.054566895300324e-08, |
|
"loss": 0.0002, |
|
"reward": 4.731869850307703, |
|
"reward_std": 3.6210765979485586, |
|
"rewards/concensus_correctness_reward_func": 1.8394375070929527, |
|
"rewards/consensus_reward_func": 0.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.6873698411509395, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.7831875099800527, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 309.03125, |
|
"epoch": 5.228571428571429, |
|
"grad_norm": 2.2618565559387207, |
|
"kl": 0.2819003712502308, |
|
"learning_rate": 6.397368838268496e-09, |
|
"loss": 0.0003, |
|
"reward": 3.303976181894541, |
|
"reward_std": 2.2868103915825486, |
|
"rewards/concensus_correctness_reward_func": 0.8359999973326921, |
|
"rewards/consensus_reward_func": 0.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.6375387134030461, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.6741875065490603, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 263.03125, |
|
"epoch": 5.3428571428571425, |
|
"grad_norm": 2.844130039215088, |
|
"kl": 0.059587346273474395, |
|
"learning_rate": 3.2708228165273244e-09, |
|
"loss": 0.0001, |
|
"reward": 5.61201386898756, |
|
"reward_std": 4.463006908656098, |
|
"rewards/concensus_correctness_reward_func": 2.957062483765185, |
|
"rewards/consensus_reward_func": 0.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.5966701377183199, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.7145312689244747, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 250.0625, |
|
"epoch": 5.457142857142857, |
|
"grad_norm": 2.9318857192993164, |
|
"kl": 0.06650862575042993, |
|
"learning_rate": 1.1791447083465133e-09, |
|
"loss": 0.0001, |
|
"reward": 5.33496169000864, |
|
"reward_std": 1.7955414667958394, |
|
"rewards/concensus_correctness_reward_func": 2.302812503403402, |
|
"rewards/consensus_reward_func": 0.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.875, |
|
"rewards/question_recreation_reward_func": 0.6075867600739002, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.8620625035837293, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 270.4375, |
|
"epoch": 5.571428571428571, |
|
"grad_norm": 2.9673755168914795, |
|
"kl": 0.08620016509667039, |
|
"learning_rate": 1.3110773862126667e-10, |
|
"loss": 0.0001, |
|
"reward": 4.476045485585928, |
|
"reward_std": 3.1387285026721656, |
|
"rewards/concensus_correctness_reward_func": 1.7854374921880662, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.606795561965555, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.9119375087320805, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 5.571428571428571, |
|
"step": 100, |
|
"total_flos": 0.0, |
|
"train_loss": 5.454165089759044e-05, |
|
"train_runtime": 1795.073, |
|
"train_samples_per_second": 0.891, |
|
"train_steps_per_second": 0.056 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|