itufilum's picture
End of training
603ed1e verified
raw
history blame
38.1 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 12.551724137931034,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 261.9375,
"epoch": 0.27586206896551724,
"grad_norm": 0.5876455307006836,
"kl": 0.0,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0,
"reward": 1.4498194679617882,
"reward_std": 1.1243495009839535,
"rewards/concensus_correctness_reward_func": 0.11999999731779099,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.37772573810070753,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.592718742787838,
"step": 2
},
{
"completion_length": 279.15625,
"epoch": 0.5517241379310345,
"grad_norm": 0.7141730785369873,
"kl": 0.0020650627120630816,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 1.2884040176868439,
"reward_std": 1.1638742461800575,
"rewards/concensus_correctness_reward_func": 0.12012499943375587,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.5039352774620056,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.3830937538295984,
"step": 4
},
{
"completion_length": 325.8125,
"epoch": 0.8275862068965517,
"grad_norm": 0.5803059935569763,
"kl": 0.000953761518758256,
"learning_rate": 4.99475706559428e-06,
"loss": 0.0,
"reward": 1.294271882623434,
"reward_std": 0.8707563504576683,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.5154593642801046,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.4194375015795231,
"step": 6
},
{
"completion_length": 361.75,
"epoch": 1.0,
"grad_norm": 0.20475846529006958,
"kl": 0.0010331896017305553,
"learning_rate": 4.979050253066064e-06,
"loss": 0.0,
"reward": 1.6736069202423096,
"reward_std": 1.0094650149345399,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.552256902679801,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.05,
"rewards/xmlcount_reward_func": 0.5713500022888184,
"step": 8
},
{
"completion_length": 384.46875,
"epoch": 1.2758620689655173,
"grad_norm": 0.43686386942863464,
"kl": 0.0012712998868664727,
"learning_rate": 4.952945442245598e-06,
"loss": 0.0,
"reward": 1.1086738537997007,
"reward_std": 0.8807330783456564,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.4103300729766488,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.4483437407761812,
"step": 10
},
{
"completion_length": 347.28125,
"epoch": 1.5517241379310345,
"grad_norm": 0.6117482781410217,
"kl": 0.0009661500589572825,
"learning_rate": 4.916552125781529e-06,
"loss": 0.0,
"reward": 1.2644198425114155,
"reward_std": 0.891217265278101,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.5375448148697615,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.3987499997019768,
"step": 12
},
{
"completion_length": 249.9375,
"epoch": 1.8275862068965516,
"grad_norm": 0.6757048964500427,
"kl": 0.0011229460214963183,
"learning_rate": 4.870022949890676e-06,
"loss": 0.0,
"reward": 1.8286586850881577,
"reward_std": 1.1331563144922256,
"rewards/concensus_correctness_reward_func": 0.0625,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.5068149194121361,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.5562187545001507,
"step": 14
},
{
"completion_length": 275.35,
"epoch": 2.0,
"grad_norm": 0.31559568643569946,
"kl": 0.00098627534462139,
"learning_rate": 4.813553074106761e-06,
"loss": 0.0,
"reward": 2.0888511657714846,
"reward_std": 1.4333394169807434,
"rewards/concensus_correctness_reward_func": 0.13020000457763672,
"rewards/consensus_reward_func": 0.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6,
"rewards/question_recreation_reward_func": 0.7344011664390564,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.05,
"rewards/xmlcount_reward_func": 0.5742499828338623,
"step": 16
},
{
"completion_length": 302.5,
"epoch": 2.2758620689655173,
"grad_norm": 0.6794589161872864,
"kl": 0.0009313603804912418,
"learning_rate": 4.747379352713489e-06,
"loss": 0.0,
"reward": 2.012173980474472,
"reward_std": 1.3338787257671356,
"rewards/concensus_correctness_reward_func": 0.10331249982118607,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.5153615120798349,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.4559999890625477,
"step": 18
},
{
"completion_length": 259.78125,
"epoch": 2.5517241379310347,
"grad_norm": 0.7855917811393738,
"kl": 0.0010634321151883341,
"learning_rate": 4.671779341295378e-06,
"loss": 0.0,
"reward": 1.7313391268253326,
"reward_std": 1.3928996622562408,
"rewards/concensus_correctness_reward_func": 0.26087500154972076,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.5740891546010971,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.45887498930096626,
"step": 20
},
{
"completion_length": 266.65625,
"epoch": 2.8275862068965516,
"grad_norm": 0.746308445930481,
"kl": 0.0011307917884550989,
"learning_rate": 4.587070132573178e-06,
"loss": 0.0,
"reward": 1.402635745704174,
"reward_std": 0.9156965278089046,
"rewards/concensus_correctness_reward_func": 0.015625,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.4964794850675389,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.5467812465503812,
"step": 22
},
{
"completion_length": 327.05,
"epoch": 3.0,
"grad_norm": 0.23480382561683655,
"kl": 0.00111085653770715,
"learning_rate": 4.493607026406802e-06,
"loss": 0.0,
"reward": 1.1529444456100464,
"reward_std": 0.523932871222496,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.2,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5830944389104843,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.025,
"rewards/xmlcount_reward_func": 0.34484999179840087,
"step": 24
},
{
"completion_length": 336.8125,
"epoch": 3.2758620689655173,
"grad_norm": 0.8294694423675537,
"kl": 0.0010076407925225794,
"learning_rate": 4.391782039544239e-06,
"loss": 0.0,
"reward": 1.3900708109140396,
"reward_std": 1.3332865573465824,
"rewards/concensus_correctness_reward_func": 0.06012500077486038,
"rewards/consensus_reward_func": 0.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.46316459868103266,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.4761562645435333,
"step": 26
},
{
"completion_length": 353.28125,
"epoch": 3.5517241379310347,
"grad_norm": 0.6492119431495667,
"kl": 0.0011136386019643396,
"learning_rate": 4.282022261367074e-06,
"loss": 0.0,
"reward": 1.7649768702685833,
"reward_std": 1.058425359427929,
"rewards/concensus_correctness_reward_func": 0.015625,
"rewards/consensus_reward_func": 0.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.6061330996453762,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.5025937429745682,
"step": 28
},
{
"completion_length": 281.6875,
"epoch": 3.8275862068965516,
"grad_norm": 0.8953530192375183,
"kl": 0.001063827752659563,
"learning_rate": 4.164788062529203e-06,
"loss": 0.0,
"reward": 2.2271154075860977,
"reward_std": 1.9515271224081516,
"rewards/concensus_correctness_reward_func": 0.640625,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.57742790132761,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.6184374950826168,
"step": 30
},
{
"completion_length": 268.9,
"epoch": 4.0,
"grad_norm": 0.5744255781173706,
"kl": 0.001324763905722648,
"learning_rate": 4.040571164002319e-06,
"loss": 0.0,
"reward": 1.4024577617645264,
"reward_std": 1.0153881907463074,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3,
"rewards/question_recreation_reward_func": 0.6460577547550201,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.05,
"rewards/xmlcount_reward_func": 0.4063999891281128,
"step": 32
},
{
"completion_length": 315.125,
"epoch": 4.275862068965517,
"grad_norm": 1.165102243423462,
"kl": 0.000977253686869517,
"learning_rate": 3.909892574627267e-06,
"loss": 0.0,
"reward": 1.1481780782341957,
"reward_std": 1.0175574347376823,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.3283343203365803,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.4448437672108412,
"step": 34
},
{
"completion_length": 285.3125,
"epoch": 4.551724137931035,
"grad_norm": 0.6390895843505859,
"kl": 0.0010886134841712192,
"learning_rate": 3.773300405821908e-06,
"loss": 0.0,
"reward": 1.2504443675279617,
"reward_std": 1.2421578019857407,
"rewards/concensus_correctness_reward_func": 0.07575000077486038,
"rewards/consensus_reward_func": 0.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.4656319017522037,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3340625036507845,
"step": 36
},
{
"completion_length": 261.6875,
"epoch": 4.827586206896552,
"grad_norm": 0.6622844934463501,
"kl": 0.0011051035835407674,
"learning_rate": 3.631367572611348e-06,
"loss": 0.0,
"reward": 1.0794794410467148,
"reward_std": 0.8599886745214462,
"rewards/concensus_correctness_reward_func": 0.0625,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.5531356520950794,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.24509375356137753,
"step": 38
},
{
"completion_length": 317.2,
"epoch": 5.0,
"grad_norm": 0.6508445143699646,
"kl": 0.0012014877400361,
"learning_rate": 3.484689390623218e-06,
"loss": 0.0,
"reward": 1.2601001858711243,
"reward_std": 0.9332681000232697,
"rewards/concensus_correctness_reward_func": 0.09599999785423279,
"rewards/consensus_reward_func": 0.1,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5755501747131347,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.025,
"rewards/xmlcount_reward_func": 0.46354998350143434,
"step": 40
},
{
"completion_length": 348.0625,
"epoch": 5.275862068965517,
"grad_norm": 0.4629547595977783,
"kl": 0.0010141393868252635,
"learning_rate": 3.333881079127052e-06,
"loss": 0.0,
"reward": 1.3980103582143784,
"reward_std": 1.1040666736662388,
"rewards/concensus_correctness_reward_func": 0.13574999943375587,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.6644478440284729,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.3790625077672303,
"step": 42
},
{
"completion_length": 273.65625,
"epoch": 5.551724137931035,
"grad_norm": 0.7342892289161682,
"kl": 0.0011725125659722835,
"learning_rate": 3.1795751805908578e-06,
"loss": 0.0,
"reward": 2.422099143266678,
"reward_std": 2.033245772123337,
"rewards/concensus_correctness_reward_func": 0.7265625,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.5944116413593292,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.3980000060983002,
"step": 44
},
{
"completion_length": 379.8125,
"epoch": 5.827586206896552,
"grad_norm": 0.652703046798706,
"kl": 0.0010911910067079589,
"learning_rate": 3.0224189075781886e-06,
"loss": 0.0,
"reward": 1.2743461802601814,
"reward_std": 1.3854865245521069,
"rewards/concensus_correctness_reward_func": 0.12262500077486038,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.4171899161301553,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.42203125543892384,
"step": 46
},
{
"completion_length": 318.95,
"epoch": 6.0,
"grad_norm": 0.22338756918907166,
"kl": 0.001001153700053692,
"learning_rate": 2.8630714281137263e-06,
"loss": 0.0,
"reward": 1.5304819822311402,
"reward_std": 1.059793508052826,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.1,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.38448197543621065,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.025,
"rewards/xmlcount_reward_func": 0.5209999918937683,
"step": 48
},
{
"completion_length": 370.46875,
"epoch": 6.275862068965517,
"grad_norm": 0.6062008738517761,
"kl": 0.0010415500364615582,
"learning_rate": 2.702201100903511e-06,
"loss": 0.0,
"reward": 1.4714525565505028,
"reward_std": 1.0109544545412064,
"rewards/concensus_correctness_reward_func": 0.0755000002682209,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.4757650615647435,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.4826874975115061,
"step": 50
},
{
"completion_length": 339.28125,
"epoch": 6.551724137931035,
"grad_norm": 0.6081772446632385,
"kl": 0.000928263645619154,
"learning_rate": 2.5404826720062544e-06,
"loss": 0.0,
"reward": 1.6288059502840042,
"reward_std": 1.1489972844719887,
"rewards/concensus_correctness_reward_func": 0.03125,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.5305872187018394,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3794687408953905,
"step": 52
},
{
"completion_length": 294.8125,
"epoch": 6.827586206896552,
"grad_norm": 0.7885801196098328,
"kl": 0.0011745219817385077,
"learning_rate": 2.3785944447138804e-06,
"loss": 0.0,
"reward": 1.3874307684600353,
"reward_std": 1.1232963781803846,
"rewards/concensus_correctness_reward_func": 0.12262500077486038,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.5388057269155979,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.42912499886006117,
"step": 54
},
{
"completion_length": 335.2,
"epoch": 7.0,
"grad_norm": 0.35713574290275574,
"kl": 0.0010486021172255277,
"learning_rate": 2.2172154345117896e-06,
"loss": 0.0,
"reward": 0.9908310234546661,
"reward_std": 0.8158440917730332,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.2,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.2,
"rewards/question_recreation_reward_func": 0.24703103601932525,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.025,
"rewards/xmlcount_reward_func": 0.31880000084638593,
"step": 56
},
{
"completion_length": 269.875,
"epoch": 7.275862068965517,
"grad_norm": 0.7617030143737793,
"kl": 0.0012455169853637926,
"learning_rate": 2.0570225210519433e-06,
"loss": 0.0,
"reward": 1.2766155302524567,
"reward_std": 0.9013489130884409,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.498896773904562,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.5433437526226044,
"step": 58
},
{
"completion_length": 359.75,
"epoch": 7.551724137931035,
"grad_norm": 0.4766957759857178,
"kl": 0.0010567399804131128,
"learning_rate": 1.8986876090843668e-06,
"loss": 0.0,
"reward": 1.335220292210579,
"reward_std": 1.233029380440712,
"rewards/concensus_correctness_reward_func": 0.03125,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.4641265389509499,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.4023437686264515,
"step": 60
},
{
"completion_length": 317.21875,
"epoch": 7.827586206896552,
"grad_norm": 0.8781487941741943,
"kl": 0.0012130630420870148,
"learning_rate": 1.7428748102551237e-06,
"loss": 0.0,
"reward": 1.2447976544499397,
"reward_std": 0.9147735517472029,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.5317039042711258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.4318437557667494,
"step": 62
},
{
"completion_length": 291.95,
"epoch": 8.0,
"grad_norm": 0.44187262654304504,
"kl": 0.0014347493182867766,
"learning_rate": 1.5902376575912815e-06,
"loss": 0.0,
"reward": 3.6168224096298216,
"reward_std": 2.845296561717987,
"rewards/concensus_correctness_reward_func": 2.0,
"rewards/consensus_reward_func": 0.3,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3,
"rewards/question_recreation_reward_func": 0.3940222471952438,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.05,
"rewards/xmlcount_reward_func": 0.5728000104427338,
"step": 64
},
{
"completion_length": 306.9375,
"epoch": 8.275862068965518,
"grad_norm": 1.304425835609436,
"kl": 0.0012883242161478847,
"learning_rate": 1.4414163643562755e-06,
"loss": 0.0,
"reward": 1.414997085928917,
"reward_std": 0.9384825639426708,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.5457158237695694,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.4630312602967024,
"step": 66
},
{
"completion_length": 275.09375,
"epoch": 8.551724137931034,
"grad_norm": 0.7772114872932434,
"kl": 0.0014374244710779749,
"learning_rate": 1.2970351387729875e-06,
"loss": 0.0,
"reward": 1.7658802941441536,
"reward_std": 1.1716703101992607,
"rewards/concensus_correctness_reward_func": 0.11599999666213989,
"rewards/consensus_reward_func": 0.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.5250677652657032,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.49981250055134296,
"step": 68
},
{
"completion_length": 330.53125,
"epoch": 8.827586206896552,
"grad_norm": 0.8543239235877991,
"kl": 0.0013485400995705277,
"learning_rate": 1.1576995658775405e-06,
"loss": 0.0,
"reward": 1.2814906537532806,
"reward_std": 1.0425735749304295,
"rewards/concensus_correctness_reward_func": 0.09137500077486038,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.4742719018831849,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.30959376133978367,
"step": 70
},
{
"completion_length": 403.5,
"epoch": 9.0,
"grad_norm": 0.21790927648544312,
"kl": 0.001029879879206419,
"learning_rate": 1.0239940674851943e-06,
"loss": 0.0,
"reward": 1.454481029510498,
"reward_std": 0.7960291028022766,
"rewards/concensus_correctness_reward_func": 0.05,
"rewards/consensus_reward_func": 0.1,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.2,
"rewards/question_recreation_reward_func": 0.46478102207183836,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.025,
"rewards/xmlcount_reward_func": 0.6146999955177307,
"step": 72
},
{
"completion_length": 258.78125,
"epoch": 9.275862068965518,
"grad_norm": 0.9624819755554199,
"kl": 0.0016905390948522836,
"learning_rate": 8.964794509221508e-07,
"loss": 0.0,
"reward": 1.8803276717662811,
"reward_std": 1.6410220339894295,
"rewards/concensus_correctness_reward_func": 0.18275000154972076,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.5239214114844799,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.720531240105629,
"step": 74
},
{
"completion_length": 320.03125,
"epoch": 9.551724137931034,
"grad_norm": 0.7233691811561584,
"kl": 0.0010870415571844205,
"learning_rate": 7.756905568047393e-07,
"loss": 0.0,
"reward": 1.6069397777318954,
"reward_std": 1.0762585289776325,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.5746584795415401,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.641656244173646,
"step": 76
},
{
"completion_length": 288.40625,
"epoch": 9.827586206896552,
"grad_norm": 0.6633221507072449,
"kl": 0.0014429407165152952,
"learning_rate": 6.621340157319998e-07,
"loss": 0.0,
"reward": 1.9194584637880325,
"reward_std": 1.9881719145923853,
"rewards/concensus_correctness_reward_func": 0.640625,
"rewards/consensus_reward_func": 0.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.46661476604640484,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.5465937443077564,
"step": 78
},
{
"completion_length": 348.0,
"epoch": 10.0,
"grad_norm": 0.4076433479785919,
"kl": 0.001379342400468886,
"learning_rate": 5.562861233008774e-07,
"loss": 0.0,
"reward": 0.835002475976944,
"reward_std": 0.8312704622745514,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3,
"rewards/question_recreation_reward_func": 0.41810246780514715,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.11689999997615814,
"step": 80
},
{
"completion_length": 284.5625,
"epoch": 10.275862068965518,
"grad_norm": 13.410774230957031,
"kl": 0.002484311757143587,
"learning_rate": 4.5859084235697236e-07,
"loss": 0.0,
"reward": 1.334033541381359,
"reward_std": 1.2119258306920528,
"rewards/concensus_correctness_reward_func": 0.12262500077486038,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.591752303764224,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.4009062615223229,
"step": 82
},
{
"completion_length": 283.3125,
"epoch": 10.551724137931034,
"grad_norm": 0.5552597641944885,
"kl": 0.0015608746980433352,
"learning_rate": 3.6945794086007706e-07,
"loss": 0.0,
"reward": 1.3054824657738209,
"reward_std": 0.9629417397081852,
"rewards/concensus_correctness_reward_func": 0.08137500286102295,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.5141699481755495,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.4443124867975712,
"step": 84
},
{
"completion_length": 349.125,
"epoch": 10.827586206896552,
"grad_norm": 0.5144564509391785,
"kl": 0.0014357663603732362,
"learning_rate": 2.892612731749414e-07,
"loss": 0.0,
"reward": 2.4467398822307587,
"reward_std": 2.3171303123235703,
"rewards/concensus_correctness_reward_func": 0.7409999966621399,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.6876149065792561,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.43999999947845936,
"step": 86
},
{
"completion_length": 266.25,
"epoch": 11.0,
"grad_norm": 0.537667453289032,
"kl": 0.0013617533957585692,
"learning_rate": 2.1833721199614992e-07,
"loss": 0.0,
"reward": 1.3557676315307616,
"reward_std": 1.1246003568172456,
"rewards/concensus_correctness_reward_func": 0.09620000123977661,
"rewards/consensus_reward_func": 0.1,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1,
"rewards/question_recreation_reward_func": 0.37711760550737383,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.025,
"rewards/xmlcount_reward_func": 0.6574500024318695,
"step": 88
},
{
"completion_length": 298.59375,
"epoch": 11.275862068965518,
"grad_norm": 0.7764319181442261,
"kl": 0.0013516054386855103,
"learning_rate": 1.5698323748414123e-07,
"loss": 0.0,
"reward": 1.9731452241539955,
"reward_std": 2.4217655174434185,
"rewards/concensus_correctness_reward_func": 0.8366875015199184,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.5108639299869537,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.29746875166893005,
"step": 90
},
{
"completion_length": 265.59375,
"epoch": 11.551724137931034,
"grad_norm": 0.7998268008232117,
"kl": 0.0019906353700207546,
"learning_rate": 1.054566895300324e-07,
"loss": 0.0,
"reward": 1.2235557958483696,
"reward_std": 0.7784941829741001,
"rewards/concensus_correctness_reward_func": 0.04337500035762787,
"rewards/consensus_reward_func": 0.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.5579620627686381,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.4659687466919422,
"step": 92
},
{
"completion_length": 302.46875,
"epoch": 11.827586206896552,
"grad_norm": 0.5803750157356262,
"kl": 0.0012688033384620212,
"learning_rate": 6.397368838268497e-08,
"loss": 0.0,
"reward": 1.6255936734378338,
"reward_std": 1.3891006745398045,
"rewards/concensus_correctness_reward_func": 0.1693749986588955,
"rewards/consensus_reward_func": 0.125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.5416874596849084,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.3207812551409006,
"step": 94
},
{
"completion_length": 357.85,
"epoch": 12.0,
"grad_norm": 0.24747301638126373,
"kl": 0.0011559076607227325,
"learning_rate": 3.270822816527325e-08,
"loss": 0.0,
"reward": 1.146351419389248,
"reward_std": 1.193832767009735,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4,
"rewards/question_recreation_reward_func": 0.5318014442920684,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.025,
"rewards/xmlcount_reward_func": 0.18954999446868898,
"step": 96
},
{
"completion_length": 286.65625,
"epoch": 12.275862068965518,
"grad_norm": 0.8936555981636047,
"kl": 0.001531564790639095,
"learning_rate": 1.1791447083465136e-08,
"loss": 0.0,
"reward": 2.939312696456909,
"reward_std": 2.551104363054037,
"rewards/concensus_correctness_reward_func": 1.25,
"rewards/consensus_reward_func": 0.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.48412513616494834,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.4551875004544854,
"step": 98
},
{
"completion_length": 366.78125,
"epoch": 12.551724137931034,
"grad_norm": 0.5418797135353088,
"kl": 0.001285279548028484,
"learning_rate": 1.3110773862126669e-09,
"loss": 0.0,
"reward": 1.3908307328820229,
"reward_std": 1.0579438470304012,
"rewards/concensus_correctness_reward_func": 0.0,
"rewards/consensus_reward_func": 0.0625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.6382682472467422,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.4244375005364418,
"step": 100
},
{
"epoch": 12.551724137931034,
"step": 100,
"total_flos": 0.0,
"train_loss": 1.1221799382710174e-06,
"train_runtime": 1585.3495,
"train_samples_per_second": 1.009,
"train_steps_per_second": 0.063
}
],
"logging_steps": 2,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}