{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 12.551724137931034, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 261.9375, "epoch": 0.27586206896551724, "grad_norm": 0.5876455307006836, "kl": 0.0, "learning_rate": 1.6666666666666667e-06, "loss": 0.0, "reward": 1.4498194679617882, "reward_std": 1.1243495009839535, "rewards/concensus_correctness_reward_func": 0.11999999731779099, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.37772573810070753, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.592718742787838, "step": 2 }, { "completion_length": 279.15625, "epoch": 0.5517241379310345, "grad_norm": 0.7141730785369873, "kl": 0.0020650627120630816, "learning_rate": 5e-06, "loss": 0.0, "reward": 1.2884040176868439, "reward_std": 1.1638742461800575, "rewards/concensus_correctness_reward_func": 0.12012499943375587, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.5039352774620056, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.3830937538295984, "step": 4 }, { "completion_length": 325.8125, "epoch": 0.8275862068965517, "grad_norm": 0.5803059935569763, "kl": 0.000953761518758256, "learning_rate": 4.99475706559428e-06, "loss": 0.0, "reward": 1.294271882623434, "reward_std": 0.8707563504576683, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.5154593642801046, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.4194375015795231, "step": 6 }, { "completion_length": 361.75, "epoch": 1.0, "grad_norm": 0.20475846529006958, "kl": 0.0010331896017305553, "learning_rate": 4.979050253066064e-06, "loss": 0.0, "reward": 1.6736069202423096, "reward_std": 1.0094650149345399, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.552256902679801, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.05, "rewards/xmlcount_reward_func": 0.5713500022888184, "step": 8 }, { "completion_length": 384.46875, "epoch": 1.2758620689655173, "grad_norm": 0.43686386942863464, "kl": 0.0012712998868664727, "learning_rate": 4.952945442245598e-06, "loss": 0.0, "reward": 1.1086738537997007, "reward_std": 0.8807330783456564, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.4103300729766488, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4483437407761812, "step": 10 }, { "completion_length": 347.28125, "epoch": 1.5517241379310345, "grad_norm": 0.6117482781410217, "kl": 0.0009661500589572825, "learning_rate": 4.916552125781529e-06, "loss": 0.0, "reward": 1.2644198425114155, "reward_std": 0.891217265278101, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.5375448148697615, "rewards/soft_format_reward_func": 0.03125, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.3987499997019768, "step": 12 }, { "completion_length": 249.9375, "epoch": 1.8275862068965516, "grad_norm": 0.6757048964500427, "kl": 0.0011229460214963183, "learning_rate": 4.870022949890676e-06, "loss": 0.0, "reward": 1.8286586850881577, "reward_std": 1.1331563144922256, "rewards/concensus_correctness_reward_func": 0.0625, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.5068149194121361, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.078125, "rewards/xmlcount_reward_func": 0.5562187545001507, "step": 14 }, { "completion_length": 275.35, "epoch": 2.0, "grad_norm": 0.31559568643569946, "kl": 0.00098627534462139, "learning_rate": 4.813553074106761e-06, "loss": 0.0, "reward": 2.0888511657714846, "reward_std": 1.4333394169807434, "rewards/concensus_correctness_reward_func": 0.13020000457763672, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.6, "rewards/question_recreation_reward_func": 0.7344011664390564, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.05, "rewards/xmlcount_reward_func": 0.5742499828338623, "step": 16 }, { "completion_length": 302.5, "epoch": 2.2758620689655173, "grad_norm": 0.6794589161872864, "kl": 0.0009313603804912418, "learning_rate": 4.747379352713489e-06, "loss": 0.0, "reward": 2.012173980474472, "reward_std": 1.3338787257671356, "rewards/concensus_correctness_reward_func": 0.10331249982118607, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.8125, "rewards/question_recreation_reward_func": 0.5153615120798349, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4559999890625477, "step": 18 }, { "completion_length": 259.78125, "epoch": 2.5517241379310347, "grad_norm": 0.7855917811393738, "kl": 0.0010634321151883341, "learning_rate": 4.671779341295378e-06, "loss": 0.0, "reward": 1.7313391268253326, "reward_std": 1.3928996622562408, "rewards/concensus_correctness_reward_func": 0.26087500154972076, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.5740891546010971, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.45887498930096626, "step": 20 }, { "completion_length": 266.65625, "epoch": 2.8275862068965516, "grad_norm": 0.746308445930481, "kl": 0.0011307917884550989, "learning_rate": 4.587070132573178e-06, "loss": 0.0, "reward": 1.402635745704174, "reward_std": 0.9156965278089046, "rewards/concensus_correctness_reward_func": 0.015625, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.4964794850675389, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.5467812465503812, "step": 22 }, { "completion_length": 327.05, "epoch": 3.0, "grad_norm": 0.23480382561683655, "kl": 0.00111085653770715, "learning_rate": 4.493607026406802e-06, "loss": 0.0, "reward": 1.1529444456100464, "reward_std": 0.523932871222496, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.2, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5830944389104843, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.025, "rewards/xmlcount_reward_func": 0.34484999179840087, "step": 24 }, { "completion_length": 336.8125, "epoch": 3.2758620689655173, "grad_norm": 0.8294694423675537, "kl": 0.0010076407925225794, "learning_rate": 4.391782039544239e-06, "loss": 0.0, "reward": 1.3900708109140396, "reward_std": 1.3332865573465824, "rewards/concensus_correctness_reward_func": 0.06012500077486038, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.46316459868103266, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.4761562645435333, "step": 26 }, { "completion_length": 353.28125, "epoch": 3.5517241379310347, "grad_norm": 0.6492119431495667, "kl": 0.0011136386019643396, "learning_rate": 4.282022261367074e-06, "loss": 0.0, "reward": 1.7649768702685833, "reward_std": 1.058425359427929, "rewards/concensus_correctness_reward_func": 0.015625, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.6061330996453762, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.5025937429745682, "step": 28 }, { "completion_length": 281.6875, "epoch": 3.8275862068965516, "grad_norm": 0.8953530192375183, "kl": 0.001063827752659563, "learning_rate": 4.164788062529203e-06, "loss": 0.0, "reward": 2.2271154075860977, "reward_std": 1.9515271224081516, "rewards/concensus_correctness_reward_func": 0.640625, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.57742790132761, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.6184374950826168, "step": 30 }, { "completion_length": 268.9, "epoch": 4.0, "grad_norm": 0.5744255781173706, "kl": 0.001324763905722648, "learning_rate": 4.040571164002319e-06, "loss": 0.0, "reward": 1.4024577617645264, "reward_std": 1.0153881907463074, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3, "rewards/question_recreation_reward_func": 0.6460577547550201, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.05, "rewards/xmlcount_reward_func": 0.4063999891281128, "step": 32 }, { "completion_length": 315.125, "epoch": 4.275862068965517, "grad_norm": 1.165102243423462, "kl": 0.000977253686869517, "learning_rate": 3.909892574627267e-06, "loss": 0.0, "reward": 1.1481780782341957, "reward_std": 1.0175574347376823, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.3283343203365803, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4448437672108412, "step": 34 }, { "completion_length": 285.3125, "epoch": 4.551724137931035, "grad_norm": 0.6390895843505859, "kl": 0.0010886134841712192, "learning_rate": 3.773300405821908e-06, "loss": 0.0, "reward": 1.2504443675279617, "reward_std": 1.2421578019857407, "rewards/concensus_correctness_reward_func": 0.07575000077486038, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.4656319017522037, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3340625036507845, "step": 36 }, { "completion_length": 261.6875, "epoch": 4.827586206896552, "grad_norm": 0.6622844934463501, "kl": 0.0011051035835407674, "learning_rate": 3.631367572611348e-06, "loss": 0.0, "reward": 1.0794794410467148, "reward_std": 0.8599886745214462, "rewards/concensus_correctness_reward_func": 0.0625, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.5531356520950794, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.24509375356137753, "step": 38 }, { "completion_length": 317.2, "epoch": 5.0, "grad_norm": 0.6508445143699646, "kl": 0.0012014877400361, "learning_rate": 3.484689390623218e-06, "loss": 0.0, "reward": 1.2601001858711243, "reward_std": 0.9332681000232697, "rewards/concensus_correctness_reward_func": 0.09599999785423279, "rewards/consensus_reward_func": 0.1, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5755501747131347, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.025, "rewards/xmlcount_reward_func": 0.46354998350143434, "step": 40 }, { "completion_length": 348.0625, "epoch": 5.275862068965517, "grad_norm": 0.4629547595977783, "kl": 0.0010141393868252635, "learning_rate": 3.333881079127052e-06, "loss": 0.0, "reward": 1.3980103582143784, "reward_std": 1.1040666736662388, "rewards/concensus_correctness_reward_func": 0.13574999943375587, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.6644478440284729, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.3790625077672303, "step": 42 }, { "completion_length": 273.65625, "epoch": 5.551724137931035, "grad_norm": 0.7342892289161682, "kl": 0.0011725125659722835, "learning_rate": 3.1795751805908578e-06, "loss": 0.0, "reward": 2.422099143266678, "reward_std": 2.033245772123337, "rewards/concensus_correctness_reward_func": 0.7265625, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.5944116413593292, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.3980000060983002, "step": 44 }, { "completion_length": 379.8125, "epoch": 5.827586206896552, "grad_norm": 0.652703046798706, "kl": 0.0010911910067079589, "learning_rate": 3.0224189075781886e-06, "loss": 0.0, "reward": 1.2743461802601814, "reward_std": 1.3854865245521069, "rewards/concensus_correctness_reward_func": 0.12262500077486038, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.4171899161301553, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.42203125543892384, "step": 46 }, { "completion_length": 318.95, "epoch": 6.0, "grad_norm": 0.22338756918907166, "kl": 0.001001153700053692, "learning_rate": 2.8630714281137263e-06, "loss": 0.0, "reward": 1.5304819822311402, "reward_std": 1.059793508052826, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.1, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.38448197543621065, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.025, "rewards/xmlcount_reward_func": 0.5209999918937683, "step": 48 }, { "completion_length": 370.46875, "epoch": 6.275862068965517, "grad_norm": 0.6062008738517761, "kl": 0.0010415500364615582, "learning_rate": 2.702201100903511e-06, "loss": 0.0, "reward": 1.4714525565505028, "reward_std": 1.0109544545412064, "rewards/concensus_correctness_reward_func": 0.0755000002682209, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.4757650615647435, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4826874975115061, "step": 50 }, { "completion_length": 339.28125, "epoch": 6.551724137931035, "grad_norm": 0.6081772446632385, "kl": 0.000928263645619154, "learning_rate": 2.5404826720062544e-06, "loss": 0.0, "reward": 1.6288059502840042, "reward_std": 1.1489972844719887, "rewards/concensus_correctness_reward_func": 0.03125, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.5305872187018394, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3794687408953905, "step": 52 }, { "completion_length": 294.8125, "epoch": 6.827586206896552, "grad_norm": 0.7885801196098328, "kl": 0.0011745219817385077, "learning_rate": 2.3785944447138804e-06, "loss": 0.0, "reward": 1.3874307684600353, "reward_std": 1.1232963781803846, "rewards/concensus_correctness_reward_func": 0.12262500077486038, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.5388057269155979, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.42912499886006117, "step": 54 }, { "completion_length": 335.2, "epoch": 7.0, "grad_norm": 0.35713574290275574, "kl": 0.0010486021172255277, "learning_rate": 2.2172154345117896e-06, "loss": 0.0, "reward": 0.9908310234546661, "reward_std": 0.8158440917730332, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.2, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.2, "rewards/question_recreation_reward_func": 0.24703103601932525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.025, "rewards/xmlcount_reward_func": 0.31880000084638593, "step": 56 }, { "completion_length": 269.875, "epoch": 7.275862068965517, "grad_norm": 0.7617030143737793, "kl": 0.0012455169853637926, "learning_rate": 2.0570225210519433e-06, "loss": 0.0, "reward": 1.2766155302524567, "reward_std": 0.9013489130884409, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.498896773904562, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.5433437526226044, "step": 58 }, { "completion_length": 359.75, "epoch": 7.551724137931035, "grad_norm": 0.4766957759857178, "kl": 0.0010567399804131128, "learning_rate": 1.8986876090843668e-06, "loss": 0.0, "reward": 1.335220292210579, "reward_std": 1.233029380440712, "rewards/concensus_correctness_reward_func": 0.03125, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.4641265389509499, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4023437686264515, "step": 60 }, { "completion_length": 317.21875, "epoch": 7.827586206896552, "grad_norm": 0.8781487941741943, "kl": 0.0012130630420870148, "learning_rate": 1.7428748102551237e-06, "loss": 0.0, "reward": 1.2447976544499397, "reward_std": 0.9147735517472029, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.5317039042711258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.4318437557667494, "step": 62 }, { "completion_length": 291.95, "epoch": 8.0, "grad_norm": 0.44187262654304504, "kl": 0.0014347493182867766, "learning_rate": 1.5902376575912815e-06, "loss": 0.0, "reward": 3.6168224096298216, "reward_std": 2.845296561717987, "rewards/concensus_correctness_reward_func": 2.0, "rewards/consensus_reward_func": 0.3, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3, "rewards/question_recreation_reward_func": 0.3940222471952438, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.05, "rewards/xmlcount_reward_func": 0.5728000104427338, "step": 64 }, { "completion_length": 306.9375, "epoch": 8.275862068965518, "grad_norm": 1.304425835609436, "kl": 0.0012883242161478847, "learning_rate": 1.4414163643562755e-06, "loss": 0.0, "reward": 1.414997085928917, "reward_std": 0.9384825639426708, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.5457158237695694, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.4630312602967024, "step": 66 }, { "completion_length": 275.09375, "epoch": 8.551724137931034, "grad_norm": 0.7772114872932434, "kl": 0.0014374244710779749, "learning_rate": 1.2970351387729875e-06, "loss": 0.0, "reward": 1.7658802941441536, "reward_std": 1.1716703101992607, "rewards/concensus_correctness_reward_func": 0.11599999666213989, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.5250677652657032, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.49981250055134296, "step": 68 }, { "completion_length": 330.53125, "epoch": 8.827586206896552, "grad_norm": 0.8543239235877991, "kl": 0.0013485400995705277, "learning_rate": 1.1576995658775405e-06, "loss": 0.0, "reward": 1.2814906537532806, "reward_std": 1.0425735749304295, "rewards/concensus_correctness_reward_func": 0.09137500077486038, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.4742719018831849, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.30959376133978367, "step": 70 }, { "completion_length": 403.5, "epoch": 9.0, "grad_norm": 0.21790927648544312, "kl": 0.001029879879206419, "learning_rate": 1.0239940674851943e-06, "loss": 0.0, "reward": 1.454481029510498, "reward_std": 0.7960291028022766, "rewards/concensus_correctness_reward_func": 0.05, "rewards/consensus_reward_func": 0.1, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.2, "rewards/question_recreation_reward_func": 0.46478102207183836, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.025, "rewards/xmlcount_reward_func": 0.6146999955177307, "step": 72 }, { "completion_length": 258.78125, "epoch": 9.275862068965518, "grad_norm": 0.9624819755554199, "kl": 0.0016905390948522836, "learning_rate": 8.964794509221508e-07, "loss": 0.0, "reward": 1.8803276717662811, "reward_std": 1.6410220339894295, "rewards/concensus_correctness_reward_func": 0.18275000154972076, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.5239214114844799, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.720531240105629, "step": 74 }, { "completion_length": 320.03125, "epoch": 9.551724137931034, "grad_norm": 0.7233691811561584, "kl": 0.0010870415571844205, "learning_rate": 7.756905568047393e-07, "loss": 0.0, "reward": 1.6069397777318954, "reward_std": 1.0762585289776325, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.5746584795415401, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.641656244173646, "step": 76 }, { "completion_length": 288.40625, "epoch": 9.827586206896552, "grad_norm": 0.6633221507072449, "kl": 0.0014429407165152952, "learning_rate": 6.621340157319998e-07, "loss": 0.0, "reward": 1.9194584637880325, "reward_std": 1.9881719145923853, "rewards/concensus_correctness_reward_func": 0.640625, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.46661476604640484, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.5465937443077564, "step": 78 }, { "completion_length": 348.0, "epoch": 10.0, "grad_norm": 0.4076433479785919, "kl": 0.001379342400468886, "learning_rate": 5.562861233008774e-07, "loss": 0.0, "reward": 0.835002475976944, "reward_std": 0.8312704622745514, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3, "rewards/question_recreation_reward_func": 0.41810246780514715, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11689999997615814, "step": 80 }, { "completion_length": 284.5625, "epoch": 10.275862068965518, "grad_norm": 13.410774230957031, "kl": 0.002484311757143587, "learning_rate": 4.5859084235697236e-07, "loss": 0.0, "reward": 1.334033541381359, "reward_std": 1.2119258306920528, "rewards/concensus_correctness_reward_func": 0.12262500077486038, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.591752303764224, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.4009062615223229, "step": 82 }, { "completion_length": 283.3125, "epoch": 10.551724137931034, "grad_norm": 0.5552597641944885, "kl": 0.0015608746980433352, "learning_rate": 3.6945794086007706e-07, "loss": 0.0, "reward": 1.3054824657738209, "reward_std": 0.9629417397081852, "rewards/concensus_correctness_reward_func": 0.08137500286102295, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.5141699481755495, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.4443124867975712, "step": 84 }, { "completion_length": 349.125, "epoch": 10.827586206896552, "grad_norm": 0.5144564509391785, "kl": 0.0014357663603732362, "learning_rate": 2.892612731749414e-07, "loss": 0.0, "reward": 2.4467398822307587, "reward_std": 2.3171303123235703, "rewards/concensus_correctness_reward_func": 0.7409999966621399, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.6876149065792561, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.43999999947845936, "step": 86 }, { "completion_length": 266.25, "epoch": 11.0, "grad_norm": 0.537667453289032, "kl": 0.0013617533957585692, "learning_rate": 2.1833721199614992e-07, "loss": 0.0, "reward": 1.3557676315307616, "reward_std": 1.1246003568172456, "rewards/concensus_correctness_reward_func": 0.09620000123977661, "rewards/consensus_reward_func": 0.1, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1, "rewards/question_recreation_reward_func": 0.37711760550737383, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.025, "rewards/xmlcount_reward_func": 0.6574500024318695, "step": 88 }, { "completion_length": 298.59375, "epoch": 11.275862068965518, "grad_norm": 0.7764319181442261, "kl": 0.0013516054386855103, "learning_rate": 1.5698323748414123e-07, "loss": 0.0, "reward": 1.9731452241539955, "reward_std": 2.4217655174434185, "rewards/concensus_correctness_reward_func": 0.8366875015199184, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.5108639299869537, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29746875166893005, "step": 90 }, { "completion_length": 265.59375, "epoch": 11.551724137931034, "grad_norm": 0.7998268008232117, "kl": 0.0019906353700207546, "learning_rate": 1.054566895300324e-07, "loss": 0.0, "reward": 1.2235557958483696, "reward_std": 0.7784941829741001, "rewards/concensus_correctness_reward_func": 0.04337500035762787, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.5579620627686381, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.4659687466919422, "step": 92 }, { "completion_length": 302.46875, "epoch": 11.827586206896552, "grad_norm": 0.5803750157356262, "kl": 0.0012688033384620212, "learning_rate": 6.397368838268497e-08, "loss": 0.0, "reward": 1.6255936734378338, "reward_std": 1.3891006745398045, "rewards/concensus_correctness_reward_func": 0.1693749986588955, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.5416874596849084, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.3207812551409006, "step": 94 }, { "completion_length": 357.85, "epoch": 12.0, "grad_norm": 0.24747301638126373, "kl": 0.0011559076607227325, "learning_rate": 3.270822816527325e-08, "loss": 0.0, "reward": 1.146351419389248, "reward_std": 1.193832767009735, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4, "rewards/question_recreation_reward_func": 0.5318014442920684, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.025, "rewards/xmlcount_reward_func": 0.18954999446868898, "step": 96 }, { "completion_length": 286.65625, "epoch": 12.275862068965518, "grad_norm": 0.8936555981636047, "kl": 0.001531564790639095, "learning_rate": 1.1791447083465136e-08, "loss": 0.0, "reward": 2.939312696456909, "reward_std": 2.551104363054037, "rewards/concensus_correctness_reward_func": 1.25, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.48412513616494834, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4551875004544854, "step": 98 }, { "completion_length": 366.78125, "epoch": 12.551724137931034, "grad_norm": 0.5418797135353088, "kl": 0.001285279548028484, "learning_rate": 1.3110773862126669e-09, "loss": 0.0, "reward": 1.3908307328820229, "reward_std": 1.0579438470304012, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.6382682472467422, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.4244375005364418, "step": 100 }, { "epoch": 12.551724137931034, "step": 100, "total_flos": 0.0, "train_loss": 1.1221799382710174e-06, "train_runtime": 1585.3495, "train_samples_per_second": 1.009, "train_steps_per_second": 0.063 } ], "logging_steps": 2, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }