{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.571428571428571, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 326.21875, "epoch": 0.11428571428571428, "grad_norm": 2.7555673122406006, "kl": 0.0, "learning_rate": 1.6666666666666665e-07, "loss": -0.0, "reward": 3.2572884149849415, "reward_std": 2.711772508919239, "rewards/concensus_correctness_reward_func": 1.3658749970927602, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.676913361530751, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.48012501280754805, "step": 2 }, { "completion_length": 223.25, "epoch": 0.22857142857142856, "grad_norm": 2.6882662773132324, "kl": 0.0006823752573836828, "learning_rate": 5e-07, "loss": 0.0, "reward": 6.5029780976474285, "reward_std": 3.8371810587123036, "rewards/concensus_correctness_reward_func": 3.9408124699257314, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.6875, "rewards/question_recreation_reward_func": 0.4921030206605792, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.7575625074096024, "step": 4 }, { "completion_length": 327.5625, "epoch": 0.34285714285714286, "grad_norm": 2.6698217391967773, "kl": 0.0007466921615559841, "learning_rate": 4.994757065594279e-07, "loss": 0.0, "reward": 3.1852196622639894, "reward_std": 1.617547769099474, "rewards/concensus_correctness_reward_func": 0.972874996252358, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.6452508568763733, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.7077187523245811, "step": 6 }, { "completion_length": 300.34375, "epoch": 0.45714285714285713, "grad_norm": 2.2753820419311523, "kl": 0.0009206273971358314, "learning_rate": 4.979050253066063e-07, "loss": 0.0, "reward": 4.944724701344967, "reward_std": 4.053568044560961, "rewards/concensus_correctness_reward_func": 2.3233749866485596, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.875, "rewards/question_recreation_reward_func": 0.6202871967107058, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.6416875068098307, "step": 8 }, { "completion_length": 251.34375, "epoch": 0.5714285714285714, "grad_norm": 3.361114263534546, "kl": 0.001550516844872618, "learning_rate": 4.952945442245597e-07, "loss": 0.0, "reward": 4.728278212249279, "reward_std": 4.151839345460758, "rewards/concensus_correctness_reward_func": 2.2346875024959445, "rewards/consensus_reward_func": 0.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.5102781374007463, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.6864375146105886, "step": 10 }, { "completion_length": 283.0625, "epoch": 0.6857142857142857, "grad_norm": 2.796189308166504, "kl": 0.0018549648048065137, "learning_rate": 4.916552125781528e-07, "loss": 0.0, "reward": 5.428475089371204, "reward_std": 2.1235571010038257, "rewards/concensus_correctness_reward_func": 2.752250012010336, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.75, "rewards/question_recreation_reward_func": 0.5671625286340714, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.6871875002980232, "step": 12 }, { "completion_length": 292.625, "epoch": 0.8, "grad_norm": 2.12504243850708, "kl": 0.002416994764644187, "learning_rate": 4.870022949890676e-07, "loss": 0.0, "reward": 3.0031582564115524, "reward_std": 1.936399682686897, "rewards/concensus_correctness_reward_func": 0.9825624963268638, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.52943952428177, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.7880312651395798, "step": 14 }, { "completion_length": 236.5, "epoch": 0.9142857142857143, "grad_norm": 3.788289785385132, "kl": 0.004197390335320961, "learning_rate": 4.81355307410676e-07, "loss": 0.0, "reward": 4.460998922586441, "reward_std": 3.673946577589959, "rewards/concensus_correctness_reward_func": 2.212312502786517, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.5744363954290748, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.6742500034160912, "step": 16 }, { "completion_length": 223.5, "epoch": 1.0, "grad_norm": 2.340423345565796, "kl": 0.00462426839900824, "learning_rate": 4.747379352713488e-07, "loss": 0.0, "reward": 5.577042788267136, "reward_std": 2.891117551790861, "rewards/concensus_correctness_reward_func": 3.5999166841308274, "rewards/consensus_reward_func": 0.3333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3333333333333333, "rewards/question_recreation_reward_func": 0.5223344924549261, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.10416666666666667, "rewards/xmlcount_reward_func": 0.6839583379526933, "step": 18 }, { "completion_length": 285.375, "epoch": 1.1142857142857143, "grad_norm": 2.9781136512756348, "kl": 0.005876571987755597, "learning_rate": 4.6717793412953776e-07, "loss": 0.0, "reward": 3.856148846447468, "reward_std": 3.056236045435071, "rewards/concensus_correctness_reward_func": 1.7554374812170863, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.5816801311448216, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.6127812552731484, "step": 20 }, { "completion_length": 258.78125, "epoch": 1.2285714285714286, "grad_norm": 2.790374279022217, "kl": 0.008101415849523619, "learning_rate": 4.5870701325731773e-07, "loss": 0.0, "reward": 5.01688564568758, "reward_std": 2.427376964595169, "rewards/concensus_correctness_reward_func": 2.506874994840473, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.875, "rewards/question_recreation_reward_func": 0.5136668155901134, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.6682187579572201, "step": 22 }, { "completion_length": 240.25, "epoch": 1.342857142857143, "grad_norm": 2.5987563133239746, "kl": 0.012570352992042899, "learning_rate": 4.4936070264068016e-07, "loss": 0.0, "reward": 4.539519101381302, "reward_std": 2.6822728496044874, "rewards/concensus_correctness_reward_func": 1.7519375048577785, "rewards/consensus_reward_func": 0.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.5148315682308748, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.7258749920874834, "step": 24 }, { "completion_length": 297.03125, "epoch": 1.457142857142857, "grad_norm": 2.034766912460327, "kl": 0.010542554169660434, "learning_rate": 4.391782039544238e-07, "loss": 0.0, "reward": 5.3825334794819355, "reward_std": 4.827194595243782, "rewards/concensus_correctness_reward_func": 2.6841250059515005, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.6875, "rewards/question_recreation_reward_func": 0.7439396986737847, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.6419687559828162, "step": 26 }, { "completion_length": 282.1875, "epoch": 1.5714285714285714, "grad_norm": 4.419146537780762, "kl": 0.010264099051710218, "learning_rate": 4.282022261367073e-07, "loss": 0.0, "reward": 2.5538329035043716, "reward_std": 0.86597695434466, "rewards/concensus_correctness_reward_func": 0.33731249440461397, "rewards/consensus_reward_func": 0.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.6221455032937229, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.6099999930593185, "step": 28 }, { "completion_length": 285.875, "epoch": 1.6857142857142857, "grad_norm": 2.3088910579681396, "kl": 0.014536559290718287, "learning_rate": 4.1647880625292027e-07, "loss": 0.0, "reward": 7.595606815069914, "reward_std": 2.5936438450589776, "rewards/concensus_correctness_reward_func": 4.180062495172024, "rewards/consensus_reward_func": 1.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.8125, "rewards/question_recreation_reward_func": 0.7352005220018327, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.6959687564522028, "step": 30 }, { "completion_length": 260.65625, "epoch": 1.8, "grad_norm": 2.6501407623291016, "kl": 0.018609989958349615, "learning_rate": 4.040571164002318e-07, "loss": 0.0, "reward": 4.827333331108093, "reward_std": 2.085965577978641, "rewards/concensus_correctness_reward_func": 2.4736250173300505, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.6875, "rewards/question_recreation_reward_func": 0.5945833660662174, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.6028750017285347, "step": 32 }, { "completion_length": 262.71875, "epoch": 1.9142857142857141, "grad_norm": 3.4099960327148438, "kl": 0.018288226914592087, "learning_rate": 3.909892574627266e-07, "loss": 0.0, "reward": 5.539210200309753, "reward_std": 2.2049794927006587, "rewards/concensus_correctness_reward_func": 2.6087499796412885, "rewards/consensus_reward_func": 0.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.6833352446556091, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.8096250100061297, "step": 34 }, { "completion_length": 315.9166666666667, "epoch": 2.0, "grad_norm": 1.4127824306488037, "kl": 0.0216289390809834, "learning_rate": 3.773300405821908e-07, "loss": 0.0, "reward": 3.2096741100152335, "reward_std": 1.8589469492435455, "rewards/concensus_correctness_reward_func": 0.7585000023245811, "rewards/consensus_reward_func": 0.4166666666666667, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.6666666666666666, "rewards/question_recreation_reward_func": 0.6264241177899142, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.6789166710029045, "step": 36 }, { "completion_length": 235.25, "epoch": 2.1142857142857143, "grad_norm": 2.856210470199585, "kl": 0.03234067652374506, "learning_rate": 3.6313675726113475e-07, "loss": 0.0, "reward": 4.8823426477611065, "reward_std": 4.0293696410954, "rewards/concensus_correctness_reward_func": 2.1174999997019768, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.6155301326652989, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.8055625003762543, "step": 38 }, { "completion_length": 289.0, "epoch": 2.2285714285714286, "grad_norm": 2.498208999633789, "kl": 0.032537119346670806, "learning_rate": 3.484689390623218e-07, "loss": 0.0, "reward": 3.257258500903845, "reward_std": 1.8479195050895214, "rewards/concensus_correctness_reward_func": 0.9986874996393453, "rewards/consensus_reward_func": 0.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.6424147803336382, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.5849062511697412, "step": 40 }, { "completion_length": 286.0, "epoch": 2.342857142857143, "grad_norm": 3.0837841033935547, "kl": 0.03241805831203237, "learning_rate": 3.3338810791270517e-07, "loss": 0.0, "reward": 6.6657252591103315, "reward_std": 5.738411407917738, "rewards/concensus_correctness_reward_func": 4.001499989069998, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.75, "rewards/question_recreation_reward_func": 0.6340690106153488, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.6551562692038715, "step": 42 }, { "completion_length": 240.5, "epoch": 2.4571428571428573, "grad_norm": 2.2751593589782715, "kl": 0.027716133918147534, "learning_rate": 3.179575180590857e-07, "loss": 0.0, "reward": 3.8581665493547916, "reward_std": 1.8522115424275398, "rewards/concensus_correctness_reward_func": 1.1526875039562583, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.5496665136888623, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.109375, "rewards/xmlcount_reward_func": 0.9839375028386712, "step": 44 }, { "completion_length": 287.8125, "epoch": 2.571428571428571, "grad_norm": 2.325424909591675, "kl": 0.03136032959446311, "learning_rate": 3.022418907578188e-07, "loss": 0.0, "reward": 5.811520978808403, "reward_std": 2.1761377695947886, "rewards/concensus_correctness_reward_func": 3.5759375113993883, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.5898647699505091, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.6925937533378601, "step": 46 }, { "completion_length": 264.09375, "epoch": 2.685714285714286, "grad_norm": 2.799055814743042, "kl": 0.035045830823946744, "learning_rate": 2.863071428113726e-07, "loss": 0.0, "reward": 5.103824369609356, "reward_std": 3.5301670129410923, "rewards/concensus_correctness_reward_func": 2.3825000133365393, "rewards/consensus_reward_func": 0.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.5504180546849966, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.6865312550216913, "step": 48 }, { "completion_length": 280.375, "epoch": 2.8, "grad_norm": 2.319396495819092, "kl": 0.033160059072542936, "learning_rate": 2.7022011009035107e-07, "loss": 0.0, "reward": 4.136773347854614, "reward_std": 1.7412771796807647, "rewards/concensus_correctness_reward_func": 1.2328750090673566, "rewards/consensus_reward_func": 0.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.7363045308738947, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.6988437669351697, "step": 50 }, { "completion_length": 252.5, "epoch": 2.914285714285714, "grad_norm": 2.2659144401550293, "kl": 0.08391272573499009, "learning_rate": 2.540482672006254e-07, "loss": 0.0001, "reward": 5.248130708932877, "reward_std": 2.9013717267662287, "rewards/concensus_correctness_reward_func": 2.086687508970499, "rewards/consensus_reward_func": 0.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.75, "rewards/question_recreation_reward_func": 0.6195994764566422, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.7918437521439046, "step": 52 }, { "completion_length": 336.3333333333333, "epoch": 3.0, "grad_norm": 1.9638556241989136, "kl": 0.0329820365489771, "learning_rate": 2.37859444471388e-07, "loss": 0.0, "reward": 2.897741069396337, "reward_std": 1.3238216874500115, "rewards/concensus_correctness_reward_func": 0.4622500070060293, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5833333333333334, "rewards/question_recreation_reward_func": 0.7555743406216303, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5965833238636454, "step": 54 }, { "completion_length": 297.90625, "epoch": 3.1142857142857143, "grad_norm": 2.512716770172119, "kl": 0.07957816089037806, "learning_rate": 2.2172154345117894e-07, "loss": 0.0001, "reward": 5.407194800674915, "reward_std": 2.5112848294666037, "rewards/concensus_correctness_reward_func": 2.6179999876767397, "rewards/consensus_reward_func": 1.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.6985698798671365, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.3562500039115548, "step": 56 }, { "completion_length": 261.375, "epoch": 3.2285714285714286, "grad_norm": 2.727494716644287, "kl": 0.051797536259982735, "learning_rate": 2.0570225210519433e-07, "loss": 0.0001, "reward": 4.303291346877813, "reward_std": 3.120794242247939, "rewards/concensus_correctness_reward_func": 1.5453749848529696, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.8125, "rewards/question_recreation_reward_func": 0.5874788034707308, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.078125, "rewards/xmlcount_reward_func": 0.6548125031404197, "step": 58 }, { "completion_length": 312.5, "epoch": 3.342857142857143, "grad_norm": 2.5867483615875244, "kl": 0.045177310064900666, "learning_rate": 1.8986876090843664e-07, "loss": 0.0, "reward": 6.679476020857692, "reward_std": 6.590652715298347, "rewards/concensus_correctness_reward_func": 3.911812473088503, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.8125, "rewards/question_recreation_reward_func": 0.6978510078042746, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.5698124994523823, "step": 60 }, { "completion_length": 275.46875, "epoch": 3.4571428571428573, "grad_norm": 2.2337958812713623, "kl": 0.05546386865898967, "learning_rate": 1.7428748102551234e-07, "loss": 0.0001, "reward": 4.224702462553978, "reward_std": 2.568043567443965, "rewards/concensus_correctness_reward_func": 1.6509375016321428, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.5963900072965771, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.8680000007152557, "step": 62 }, { "completion_length": 254.375, "epoch": 3.571428571428571, "grad_norm": 2.5836949348449707, "kl": 0.06771399604622275, "learning_rate": 1.5902376575912814e-07, "loss": 0.0001, "reward": 5.309353556483984, "reward_std": 2.0958344470709562, "rewards/concensus_correctness_reward_func": 2.4754375047050416, "rewards/consensus_reward_func": 0.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.5066348570398986, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.7335312599316239, "step": 64 }, { "completion_length": 294.90625, "epoch": 3.685714285714286, "grad_norm": 2.2413110733032227, "kl": 0.058190350187942386, "learning_rate": 1.4414163643562753e-07, "loss": 0.0001, "reward": 4.059148486703634, "reward_std": 2.2921111752657453, "rewards/concensus_correctness_reward_func": 1.5227500088512897, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.7638359684497118, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.7569375038146973, "step": 66 }, { "completion_length": 277.53125, "epoch": 3.8, "grad_norm": 2.7358996868133545, "kl": 0.07721107231918722, "learning_rate": 1.2970351387729872e-07, "loss": 0.0001, "reward": 3.4795289039611816, "reward_std": 2.1741816513240337, "rewards/concensus_correctness_reward_func": 0.8386874985590111, "rewards/consensus_reward_func": 0.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.6809976994991302, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.078125, "rewards/xmlcount_reward_func": 0.8817187771201134, "step": 68 }, { "completion_length": 256.0625, "epoch": 3.914285714285714, "grad_norm": 4.087591648101807, "kl": 0.0788359681610018, "learning_rate": 1.1576995658775404e-07, "loss": 0.0001, "reward": 6.083272695541382, "reward_std": 4.112470694584772, "rewards/concensus_correctness_reward_func": 3.4234374810475856, "rewards/consensus_reward_func": 0.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.5136476922780275, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.8336875168606639, "step": 70 }, { "completion_length": 276.5, "epoch": 4.0, "grad_norm": 1.627131700515747, "kl": 0.11795352476959427, "learning_rate": 1.0239940674851941e-07, "loss": 0.0001, "reward": 5.3608784476916, "reward_std": 2.300887676576773, "rewards/concensus_correctness_reward_func": 1.9943333491683006, "rewards/consensus_reward_func": 1.0833333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.75, "rewards/question_recreation_reward_func": 0.6572117364654938, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.08333333333333333, "rewards/xmlcount_reward_func": 0.7926666811108589, "step": 72 }, { "completion_length": 353.4375, "epoch": 4.114285714285714, "grad_norm": 4.412367820739746, "kl": 0.08782886900007725, "learning_rate": 8.964794509221507e-08, "loss": 0.0001, "reward": 4.319345578551292, "reward_std": 1.4791212249547243, "rewards/concensus_correctness_reward_func": 1.8143750003073364, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.6734706219285727, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.706500010099262, "step": 74 }, { "completion_length": 298.5, "epoch": 4.228571428571429, "grad_norm": 43.921119689941406, "kl": 0.09398894105106592, "learning_rate": 7.756905568047392e-08, "loss": 0.0001, "reward": 5.068341612815857, "reward_std": 2.5960501823574305, "rewards/concensus_correctness_reward_func": 2.2858750016748672, "rewards/consensus_reward_func": 0.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.6875, "rewards/question_recreation_reward_func": 0.6213102764450014, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.6142812587786466, "step": 76 }, { "completion_length": 279.4375, "epoch": 4.3428571428571425, "grad_norm": 3.3324248790740967, "kl": 0.12617466738447547, "learning_rate": 6.621340157319996e-08, "loss": 0.0001, "reward": 6.855641521513462, "reward_std": 5.589063869789243, "rewards/concensus_correctness_reward_func": 3.5954999728128314, "rewards/consensus_reward_func": 1.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.6875, "rewards/question_recreation_reward_func": 0.6044852556660771, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.078125, "rewards/xmlcount_reward_func": 0.7650312539190054, "step": 78 }, { "completion_length": 282.25, "epoch": 4.457142857142857, "grad_norm": 2.391388416290283, "kl": 0.0781217070762068, "learning_rate": 5.5628612330087724e-08, "loss": 0.0001, "reward": 5.143940486013889, "reward_std": 3.1008094910066575, "rewards/concensus_correctness_reward_func": 2.1685625007376075, "rewards/consensus_reward_func": 0.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.8125, "rewards/question_recreation_reward_func": 0.5825029462575912, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.109375, "rewards/xmlcount_reward_func": 0.7835000064224005, "step": 80 }, { "completion_length": 250.875, "epoch": 4.571428571428571, "grad_norm": 3.275106906890869, "kl": 0.08860545780044049, "learning_rate": 4.5859084235697235e-08, "loss": 0.0001, "reward": 3.176556244492531, "reward_std": 1.5254050176981764, "rewards/concensus_correctness_reward_func": 0.8501874984940514, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.5377749832696281, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.109375, "rewards/xmlcount_reward_func": 0.9917187504470348, "step": 82 }, { "completion_length": 252.65625, "epoch": 4.685714285714286, "grad_norm": 5.577023029327393, "kl": 0.14010742434766144, "learning_rate": 3.6945794086007705e-08, "loss": 0.0001, "reward": 4.1389394868165255, "reward_std": 2.314908188767731, "rewards/concensus_correctness_reward_func": 1.5294374911900377, "rewards/consensus_reward_func": 0.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.560220692306757, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.078125, "rewards/xmlcount_reward_func": 0.6586562575539574, "step": 84 }, { "completion_length": 257.34375, "epoch": 4.8, "grad_norm": 2.5406899452209473, "kl": 0.09263441083021462, "learning_rate": 2.892612731749414e-08, "loss": 0.0001, "reward": 3.948366153985262, "reward_std": 1.6628333161497721, "rewards/concensus_correctness_reward_func": 1.1678125290200114, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.6875, "rewards/question_recreation_reward_func": 0.6303973635658622, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.8064062613993883, "step": 86 }, { "completion_length": 266.59375, "epoch": 4.914285714285715, "grad_norm": 2.87119197845459, "kl": 0.11624281201511621, "learning_rate": 2.183372119961499e-08, "loss": 0.0001, "reward": 4.197016902267933, "reward_std": 2.4855765970423818, "rewards/concensus_correctness_reward_func": 1.5803749952465296, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.6657043690793216, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.078125, "rewards/xmlcount_reward_func": 0.9353125058114529, "step": 88 }, { "completion_length": 259.5833333333333, "epoch": 5.0, "grad_norm": 3.3491158485412598, "kl": 0.16860986345758042, "learning_rate": 1.5698323748414122e-08, "loss": 0.0001, "reward": 7.5917567908763885, "reward_std": 5.551452632372578, "rewards/concensus_correctness_reward_func": 4.24174995906651, "rewards/consensus_reward_func": 0.8333333333333334, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.8333333333333334, "rewards/question_recreation_reward_func": 0.7116317736605803, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.041666666666666664, "rewards/xmlcount_reward_func": 0.9300416857004166, "step": 90 }, { "completion_length": 259.78125, "epoch": 5.114285714285714, "grad_norm": 4.982123851776123, "kl": 0.2259263969026506, "learning_rate": 1.054566895300324e-08, "loss": 0.0002, "reward": 4.731869850307703, "reward_std": 3.6210765979485586, "rewards/concensus_correctness_reward_func": 1.8394375070929527, "rewards/consensus_reward_func": 0.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.6873698411509395, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.7831875099800527, "step": 92 }, { "completion_length": 309.03125, "epoch": 5.228571428571429, "grad_norm": 2.2618565559387207, "kl": 0.2819003712502308, "learning_rate": 6.397368838268496e-09, "loss": 0.0003, "reward": 3.303976181894541, "reward_std": 2.2868103915825486, "rewards/concensus_correctness_reward_func": 0.8359999973326921, "rewards/consensus_reward_func": 0.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.6375387134030461, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.6741875065490603, "step": 94 }, { "completion_length": 263.03125, "epoch": 5.3428571428571425, "grad_norm": 2.844130039215088, "kl": 0.059587346273474395, "learning_rate": 3.2708228165273244e-09, "loss": 0.0001, "reward": 5.61201386898756, "reward_std": 4.463006908656098, "rewards/concensus_correctness_reward_func": 2.957062483765185, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.5966701377183199, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.7145312689244747, "step": 96 }, { "completion_length": 250.0625, "epoch": 5.457142857142857, "grad_norm": 2.9318857192993164, "kl": 0.06650862575042993, "learning_rate": 1.1791447083465133e-09, "loss": 0.0001, "reward": 5.33496169000864, "reward_std": 1.7955414667958394, "rewards/concensus_correctness_reward_func": 2.302812503403402, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.875, "rewards/question_recreation_reward_func": 0.6075867600739002, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.8620625035837293, "step": 98 }, { "completion_length": 270.4375, "epoch": 5.571428571428571, "grad_norm": 2.9673755168914795, "kl": 0.08620016509667039, "learning_rate": 1.3110773862126667e-10, "loss": 0.0001, "reward": 4.476045485585928, "reward_std": 3.1387285026721656, "rewards/concensus_correctness_reward_func": 1.7854374921880662, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.606795561965555, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.109375, "rewards/xmlcount_reward_func": 0.9119375087320805, "step": 100 }, { "epoch": 5.571428571428571, "step": 100, "total_flos": 0.0, "train_loss": 5.454165089759044e-05, "train_runtime": 1795.073, "train_samples_per_second": 0.891, "train_steps_per_second": 0.056 } ], "logging_steps": 2, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }