|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 12.551724137931034, |
|
"eval_steps": 500, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 261.9375, |
|
"epoch": 0.27586206896551724, |
|
"grad_norm": 0.5876455307006836, |
|
"kl": 0.0, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.0, |
|
"reward": 1.4498194679617882, |
|
"reward_std": 1.1243495009839535, |
|
"rewards/concensus_correctness_reward_func": 0.11999999731779099, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.37772573810070753, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.592718742787838, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 279.15625, |
|
"epoch": 0.5517241379310345, |
|
"grad_norm": 0.7141730785369873, |
|
"kl": 0.0020650627120630816, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 1.2884040176868439, |
|
"reward_std": 1.1638742461800575, |
|
"rewards/concensus_correctness_reward_func": 0.12012499943375587, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.5039352774620056, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.3830937538295984, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 325.8125, |
|
"epoch": 0.8275862068965517, |
|
"grad_norm": 0.5803059935569763, |
|
"kl": 0.000953761518758256, |
|
"learning_rate": 4.99475706559428e-06, |
|
"loss": 0.0, |
|
"reward": 1.294271882623434, |
|
"reward_std": 0.8707563504576683, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.5154593642801046, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.4194375015795231, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 361.75, |
|
"epoch": 1.0, |
|
"grad_norm": 0.20475846529006958, |
|
"kl": 0.0010331896017305553, |
|
"learning_rate": 4.979050253066064e-06, |
|
"loss": 0.0, |
|
"reward": 1.6736069202423096, |
|
"reward_std": 1.0094650149345399, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.552256902679801, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.05, |
|
"rewards/xmlcount_reward_func": 0.5713500022888184, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 384.46875, |
|
"epoch": 1.2758620689655173, |
|
"grad_norm": 0.43686386942863464, |
|
"kl": 0.0012712998868664727, |
|
"learning_rate": 4.952945442245598e-06, |
|
"loss": 0.0, |
|
"reward": 1.1086738537997007, |
|
"reward_std": 0.8807330783456564, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.4103300729766488, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.4483437407761812, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 347.28125, |
|
"epoch": 1.5517241379310345, |
|
"grad_norm": 0.6117482781410217, |
|
"kl": 0.0009661500589572825, |
|
"learning_rate": 4.916552125781529e-06, |
|
"loss": 0.0, |
|
"reward": 1.2644198425114155, |
|
"reward_std": 0.891217265278101, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.5375448148697615, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.3987499997019768, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 249.9375, |
|
"epoch": 1.8275862068965516, |
|
"grad_norm": 0.6757048964500427, |
|
"kl": 0.0011229460214963183, |
|
"learning_rate": 4.870022949890676e-06, |
|
"loss": 0.0, |
|
"reward": 1.8286586850881577, |
|
"reward_std": 1.1331563144922256, |
|
"rewards/concensus_correctness_reward_func": 0.0625, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.5068149194121361, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.5562187545001507, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 275.35, |
|
"epoch": 2.0, |
|
"grad_norm": 0.31559568643569946, |
|
"kl": 0.00098627534462139, |
|
"learning_rate": 4.813553074106761e-06, |
|
"loss": 0.0, |
|
"reward": 2.0888511657714846, |
|
"reward_std": 1.4333394169807434, |
|
"rewards/concensus_correctness_reward_func": 0.13020000457763672, |
|
"rewards/consensus_reward_func": 0.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6, |
|
"rewards/question_recreation_reward_func": 0.7344011664390564, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.05, |
|
"rewards/xmlcount_reward_func": 0.5742499828338623, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 302.5, |
|
"epoch": 2.2758620689655173, |
|
"grad_norm": 0.6794589161872864, |
|
"kl": 0.0009313603804912418, |
|
"learning_rate": 4.747379352713489e-06, |
|
"loss": 0.0, |
|
"reward": 2.012173980474472, |
|
"reward_std": 1.3338787257671356, |
|
"rewards/concensus_correctness_reward_func": 0.10331249982118607, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.5153615120798349, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.4559999890625477, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 259.78125, |
|
"epoch": 2.5517241379310347, |
|
"grad_norm": 0.7855917811393738, |
|
"kl": 0.0010634321151883341, |
|
"learning_rate": 4.671779341295378e-06, |
|
"loss": 0.0, |
|
"reward": 1.7313391268253326, |
|
"reward_std": 1.3928996622562408, |
|
"rewards/concensus_correctness_reward_func": 0.26087500154972076, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.5740891546010971, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.45887498930096626, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 266.65625, |
|
"epoch": 2.8275862068965516, |
|
"grad_norm": 0.746308445930481, |
|
"kl": 0.0011307917884550989, |
|
"learning_rate": 4.587070132573178e-06, |
|
"loss": 0.0, |
|
"reward": 1.402635745704174, |
|
"reward_std": 0.9156965278089046, |
|
"rewards/concensus_correctness_reward_func": 0.015625, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.4964794850675389, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.5467812465503812, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 327.05, |
|
"epoch": 3.0, |
|
"grad_norm": 0.23480382561683655, |
|
"kl": 0.00111085653770715, |
|
"learning_rate": 4.493607026406802e-06, |
|
"loss": 0.0, |
|
"reward": 1.1529444456100464, |
|
"reward_std": 0.523932871222496, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.2, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5830944389104843, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.025, |
|
"rewards/xmlcount_reward_func": 0.34484999179840087, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 336.8125, |
|
"epoch": 3.2758620689655173, |
|
"grad_norm": 0.8294694423675537, |
|
"kl": 0.0010076407925225794, |
|
"learning_rate": 4.391782039544239e-06, |
|
"loss": 0.0, |
|
"reward": 1.3900708109140396, |
|
"reward_std": 1.3332865573465824, |
|
"rewards/concensus_correctness_reward_func": 0.06012500077486038, |
|
"rewards/consensus_reward_func": 0.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.46316459868103266, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.4761562645435333, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 353.28125, |
|
"epoch": 3.5517241379310347, |
|
"grad_norm": 0.6492119431495667, |
|
"kl": 0.0011136386019643396, |
|
"learning_rate": 4.282022261367074e-06, |
|
"loss": 0.0, |
|
"reward": 1.7649768702685833, |
|
"reward_std": 1.058425359427929, |
|
"rewards/concensus_correctness_reward_func": 0.015625, |
|
"rewards/consensus_reward_func": 0.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.6061330996453762, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.5025937429745682, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 281.6875, |
|
"epoch": 3.8275862068965516, |
|
"grad_norm": 0.8953530192375183, |
|
"kl": 0.001063827752659563, |
|
"learning_rate": 4.164788062529203e-06, |
|
"loss": 0.0, |
|
"reward": 2.2271154075860977, |
|
"reward_std": 1.9515271224081516, |
|
"rewards/concensus_correctness_reward_func": 0.640625, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.57742790132761, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.6184374950826168, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 268.9, |
|
"epoch": 4.0, |
|
"grad_norm": 0.5744255781173706, |
|
"kl": 0.001324763905722648, |
|
"learning_rate": 4.040571164002319e-06, |
|
"loss": 0.0, |
|
"reward": 1.4024577617645264, |
|
"reward_std": 1.0153881907463074, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3, |
|
"rewards/question_recreation_reward_func": 0.6460577547550201, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.05, |
|
"rewards/xmlcount_reward_func": 0.4063999891281128, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 315.125, |
|
"epoch": 4.275862068965517, |
|
"grad_norm": 1.165102243423462, |
|
"kl": 0.000977253686869517, |
|
"learning_rate": 3.909892574627267e-06, |
|
"loss": 0.0, |
|
"reward": 1.1481780782341957, |
|
"reward_std": 1.0175574347376823, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.3283343203365803, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.4448437672108412, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 285.3125, |
|
"epoch": 4.551724137931035, |
|
"grad_norm": 0.6390895843505859, |
|
"kl": 0.0010886134841712192, |
|
"learning_rate": 3.773300405821908e-06, |
|
"loss": 0.0, |
|
"reward": 1.2504443675279617, |
|
"reward_std": 1.2421578019857407, |
|
"rewards/concensus_correctness_reward_func": 0.07575000077486038, |
|
"rewards/consensus_reward_func": 0.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.4656319017522037, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3340625036507845, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 261.6875, |
|
"epoch": 4.827586206896552, |
|
"grad_norm": 0.6622844934463501, |
|
"kl": 0.0011051035835407674, |
|
"learning_rate": 3.631367572611348e-06, |
|
"loss": 0.0, |
|
"reward": 1.0794794410467148, |
|
"reward_std": 0.8599886745214462, |
|
"rewards/concensus_correctness_reward_func": 0.0625, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.5531356520950794, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.24509375356137753, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 317.2, |
|
"epoch": 5.0, |
|
"grad_norm": 0.6508445143699646, |
|
"kl": 0.0012014877400361, |
|
"learning_rate": 3.484689390623218e-06, |
|
"loss": 0.0, |
|
"reward": 1.2601001858711243, |
|
"reward_std": 0.9332681000232697, |
|
"rewards/concensus_correctness_reward_func": 0.09599999785423279, |
|
"rewards/consensus_reward_func": 0.1, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5755501747131347, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.025, |
|
"rewards/xmlcount_reward_func": 0.46354998350143434, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 348.0625, |
|
"epoch": 5.275862068965517, |
|
"grad_norm": 0.4629547595977783, |
|
"kl": 0.0010141393868252635, |
|
"learning_rate": 3.333881079127052e-06, |
|
"loss": 0.0, |
|
"reward": 1.3980103582143784, |
|
"reward_std": 1.1040666736662388, |
|
"rewards/concensus_correctness_reward_func": 0.13574999943375587, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.6644478440284729, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.3790625077672303, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 273.65625, |
|
"epoch": 5.551724137931035, |
|
"grad_norm": 0.7342892289161682, |
|
"kl": 0.0011725125659722835, |
|
"learning_rate": 3.1795751805908578e-06, |
|
"loss": 0.0, |
|
"reward": 2.422099143266678, |
|
"reward_std": 2.033245772123337, |
|
"rewards/concensus_correctness_reward_func": 0.7265625, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.5944116413593292, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.3980000060983002, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 379.8125, |
|
"epoch": 5.827586206896552, |
|
"grad_norm": 0.652703046798706, |
|
"kl": 0.0010911910067079589, |
|
"learning_rate": 3.0224189075781886e-06, |
|
"loss": 0.0, |
|
"reward": 1.2743461802601814, |
|
"reward_std": 1.3854865245521069, |
|
"rewards/concensus_correctness_reward_func": 0.12262500077486038, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.4171899161301553, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.42203125543892384, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 318.95, |
|
"epoch": 6.0, |
|
"grad_norm": 0.22338756918907166, |
|
"kl": 0.001001153700053692, |
|
"learning_rate": 2.8630714281137263e-06, |
|
"loss": 0.0, |
|
"reward": 1.5304819822311402, |
|
"reward_std": 1.059793508052826, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.1, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.38448197543621065, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.025, |
|
"rewards/xmlcount_reward_func": 0.5209999918937683, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 370.46875, |
|
"epoch": 6.275862068965517, |
|
"grad_norm": 0.6062008738517761, |
|
"kl": 0.0010415500364615582, |
|
"learning_rate": 2.702201100903511e-06, |
|
"loss": 0.0, |
|
"reward": 1.4714525565505028, |
|
"reward_std": 1.0109544545412064, |
|
"rewards/concensus_correctness_reward_func": 0.0755000002682209, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.4757650615647435, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.4826874975115061, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 339.28125, |
|
"epoch": 6.551724137931035, |
|
"grad_norm": 0.6081772446632385, |
|
"kl": 0.000928263645619154, |
|
"learning_rate": 2.5404826720062544e-06, |
|
"loss": 0.0, |
|
"reward": 1.6288059502840042, |
|
"reward_std": 1.1489972844719887, |
|
"rewards/concensus_correctness_reward_func": 0.03125, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.5305872187018394, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3794687408953905, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 294.8125, |
|
"epoch": 6.827586206896552, |
|
"grad_norm": 0.7885801196098328, |
|
"kl": 0.0011745219817385077, |
|
"learning_rate": 2.3785944447138804e-06, |
|
"loss": 0.0, |
|
"reward": 1.3874307684600353, |
|
"reward_std": 1.1232963781803846, |
|
"rewards/concensus_correctness_reward_func": 0.12262500077486038, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.5388057269155979, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.42912499886006117, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 335.2, |
|
"epoch": 7.0, |
|
"grad_norm": 0.35713574290275574, |
|
"kl": 0.0010486021172255277, |
|
"learning_rate": 2.2172154345117896e-06, |
|
"loss": 0.0, |
|
"reward": 0.9908310234546661, |
|
"reward_std": 0.8158440917730332, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.2, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.2, |
|
"rewards/question_recreation_reward_func": 0.24703103601932525, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.025, |
|
"rewards/xmlcount_reward_func": 0.31880000084638593, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 269.875, |
|
"epoch": 7.275862068965517, |
|
"grad_norm": 0.7617030143737793, |
|
"kl": 0.0012455169853637926, |
|
"learning_rate": 2.0570225210519433e-06, |
|
"loss": 0.0, |
|
"reward": 1.2766155302524567, |
|
"reward_std": 0.9013489130884409, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.498896773904562, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.5433437526226044, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 359.75, |
|
"epoch": 7.551724137931035, |
|
"grad_norm": 0.4766957759857178, |
|
"kl": 0.0010567399804131128, |
|
"learning_rate": 1.8986876090843668e-06, |
|
"loss": 0.0, |
|
"reward": 1.335220292210579, |
|
"reward_std": 1.233029380440712, |
|
"rewards/concensus_correctness_reward_func": 0.03125, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.4641265389509499, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.4023437686264515, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 317.21875, |
|
"epoch": 7.827586206896552, |
|
"grad_norm": 0.8781487941741943, |
|
"kl": 0.0012130630420870148, |
|
"learning_rate": 1.7428748102551237e-06, |
|
"loss": 0.0, |
|
"reward": 1.2447976544499397, |
|
"reward_std": 0.9147735517472029, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.5317039042711258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.4318437557667494, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 291.95, |
|
"epoch": 8.0, |
|
"grad_norm": 0.44187262654304504, |
|
"kl": 0.0014347493182867766, |
|
"learning_rate": 1.5902376575912815e-06, |
|
"loss": 0.0, |
|
"reward": 3.6168224096298216, |
|
"reward_std": 2.845296561717987, |
|
"rewards/concensus_correctness_reward_func": 2.0, |
|
"rewards/consensus_reward_func": 0.3, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3, |
|
"rewards/question_recreation_reward_func": 0.3940222471952438, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.05, |
|
"rewards/xmlcount_reward_func": 0.5728000104427338, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 306.9375, |
|
"epoch": 8.275862068965518, |
|
"grad_norm": 1.304425835609436, |
|
"kl": 0.0012883242161478847, |
|
"learning_rate": 1.4414163643562755e-06, |
|
"loss": 0.0, |
|
"reward": 1.414997085928917, |
|
"reward_std": 0.9384825639426708, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.5457158237695694, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.4630312602967024, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 275.09375, |
|
"epoch": 8.551724137931034, |
|
"grad_norm": 0.7772114872932434, |
|
"kl": 0.0014374244710779749, |
|
"learning_rate": 1.2970351387729875e-06, |
|
"loss": 0.0, |
|
"reward": 1.7658802941441536, |
|
"reward_std": 1.1716703101992607, |
|
"rewards/concensus_correctness_reward_func": 0.11599999666213989, |
|
"rewards/consensus_reward_func": 0.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.5250677652657032, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.49981250055134296, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 330.53125, |
|
"epoch": 8.827586206896552, |
|
"grad_norm": 0.8543239235877991, |
|
"kl": 0.0013485400995705277, |
|
"learning_rate": 1.1576995658775405e-06, |
|
"loss": 0.0, |
|
"reward": 1.2814906537532806, |
|
"reward_std": 1.0425735749304295, |
|
"rewards/concensus_correctness_reward_func": 0.09137500077486038, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.4742719018831849, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.30959376133978367, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 403.5, |
|
"epoch": 9.0, |
|
"grad_norm": 0.21790927648544312, |
|
"kl": 0.001029879879206419, |
|
"learning_rate": 1.0239940674851943e-06, |
|
"loss": 0.0, |
|
"reward": 1.454481029510498, |
|
"reward_std": 0.7960291028022766, |
|
"rewards/concensus_correctness_reward_func": 0.05, |
|
"rewards/consensus_reward_func": 0.1, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.2, |
|
"rewards/question_recreation_reward_func": 0.46478102207183836, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.025, |
|
"rewards/xmlcount_reward_func": 0.6146999955177307, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 258.78125, |
|
"epoch": 9.275862068965518, |
|
"grad_norm": 0.9624819755554199, |
|
"kl": 0.0016905390948522836, |
|
"learning_rate": 8.964794509221508e-07, |
|
"loss": 0.0, |
|
"reward": 1.8803276717662811, |
|
"reward_std": 1.6410220339894295, |
|
"rewards/concensus_correctness_reward_func": 0.18275000154972076, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.5239214114844799, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.720531240105629, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 320.03125, |
|
"epoch": 9.551724137931034, |
|
"grad_norm": 0.7233691811561584, |
|
"kl": 0.0010870415571844205, |
|
"learning_rate": 7.756905568047393e-07, |
|
"loss": 0.0, |
|
"reward": 1.6069397777318954, |
|
"reward_std": 1.0762585289776325, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.5746584795415401, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.641656244173646, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 288.40625, |
|
"epoch": 9.827586206896552, |
|
"grad_norm": 0.6633221507072449, |
|
"kl": 0.0014429407165152952, |
|
"learning_rate": 6.621340157319998e-07, |
|
"loss": 0.0, |
|
"reward": 1.9194584637880325, |
|
"reward_std": 1.9881719145923853, |
|
"rewards/concensus_correctness_reward_func": 0.640625, |
|
"rewards/consensus_reward_func": 0.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.46661476604640484, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.5465937443077564, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 348.0, |
|
"epoch": 10.0, |
|
"grad_norm": 0.4076433479785919, |
|
"kl": 0.001379342400468886, |
|
"learning_rate": 5.562861233008774e-07, |
|
"loss": 0.0, |
|
"reward": 0.835002475976944, |
|
"reward_std": 0.8312704622745514, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3, |
|
"rewards/question_recreation_reward_func": 0.41810246780514715, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.11689999997615814, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 284.5625, |
|
"epoch": 10.275862068965518, |
|
"grad_norm": 13.410774230957031, |
|
"kl": 0.002484311757143587, |
|
"learning_rate": 4.5859084235697236e-07, |
|
"loss": 0.0, |
|
"reward": 1.334033541381359, |
|
"reward_std": 1.2119258306920528, |
|
"rewards/concensus_correctness_reward_func": 0.12262500077486038, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.591752303764224, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.4009062615223229, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 283.3125, |
|
"epoch": 10.551724137931034, |
|
"grad_norm": 0.5552597641944885, |
|
"kl": 0.0015608746980433352, |
|
"learning_rate": 3.6945794086007706e-07, |
|
"loss": 0.0, |
|
"reward": 1.3054824657738209, |
|
"reward_std": 0.9629417397081852, |
|
"rewards/concensus_correctness_reward_func": 0.08137500286102295, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.5141699481755495, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.4443124867975712, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 349.125, |
|
"epoch": 10.827586206896552, |
|
"grad_norm": 0.5144564509391785, |
|
"kl": 0.0014357663603732362, |
|
"learning_rate": 2.892612731749414e-07, |
|
"loss": 0.0, |
|
"reward": 2.4467398822307587, |
|
"reward_std": 2.3171303123235703, |
|
"rewards/concensus_correctness_reward_func": 0.7409999966621399, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.6876149065792561, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.43999999947845936, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 266.25, |
|
"epoch": 11.0, |
|
"grad_norm": 0.537667453289032, |
|
"kl": 0.0013617533957585692, |
|
"learning_rate": 2.1833721199614992e-07, |
|
"loss": 0.0, |
|
"reward": 1.3557676315307616, |
|
"reward_std": 1.1246003568172456, |
|
"rewards/concensus_correctness_reward_func": 0.09620000123977661, |
|
"rewards/consensus_reward_func": 0.1, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1, |
|
"rewards/question_recreation_reward_func": 0.37711760550737383, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.025, |
|
"rewards/xmlcount_reward_func": 0.6574500024318695, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 298.59375, |
|
"epoch": 11.275862068965518, |
|
"grad_norm": 0.7764319181442261, |
|
"kl": 0.0013516054386855103, |
|
"learning_rate": 1.5698323748414123e-07, |
|
"loss": 0.0, |
|
"reward": 1.9731452241539955, |
|
"reward_std": 2.4217655174434185, |
|
"rewards/concensus_correctness_reward_func": 0.8366875015199184, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.5108639299869537, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.29746875166893005, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 265.59375, |
|
"epoch": 11.551724137931034, |
|
"grad_norm": 0.7998268008232117, |
|
"kl": 0.0019906353700207546, |
|
"learning_rate": 1.054566895300324e-07, |
|
"loss": 0.0, |
|
"reward": 1.2235557958483696, |
|
"reward_std": 0.7784941829741001, |
|
"rewards/concensus_correctness_reward_func": 0.04337500035762787, |
|
"rewards/consensus_reward_func": 0.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.5579620627686381, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.4659687466919422, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 302.46875, |
|
"epoch": 11.827586206896552, |
|
"grad_norm": 0.5803750157356262, |
|
"kl": 0.0012688033384620212, |
|
"learning_rate": 6.397368838268497e-08, |
|
"loss": 0.0, |
|
"reward": 1.6255936734378338, |
|
"reward_std": 1.3891006745398045, |
|
"rewards/concensus_correctness_reward_func": 0.1693749986588955, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4375, |
|
"rewards/question_recreation_reward_func": 0.5416874596849084, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.3207812551409006, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 357.85, |
|
"epoch": 12.0, |
|
"grad_norm": 0.24747301638126373, |
|
"kl": 0.0011559076607227325, |
|
"learning_rate": 3.270822816527325e-08, |
|
"loss": 0.0, |
|
"reward": 1.146351419389248, |
|
"reward_std": 1.193832767009735, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4, |
|
"rewards/question_recreation_reward_func": 0.5318014442920684, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.025, |
|
"rewards/xmlcount_reward_func": 0.18954999446868898, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 286.65625, |
|
"epoch": 12.275862068965518, |
|
"grad_norm": 0.8936555981636047, |
|
"kl": 0.001531564790639095, |
|
"learning_rate": 1.1791447083465136e-08, |
|
"loss": 0.0, |
|
"reward": 2.939312696456909, |
|
"reward_std": 2.551104363054037, |
|
"rewards/concensus_correctness_reward_func": 1.25, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.48412513616494834, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.4551875004544854, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 366.78125, |
|
"epoch": 12.551724137931034, |
|
"grad_norm": 0.5418797135353088, |
|
"kl": 0.001285279548028484, |
|
"learning_rate": 1.3110773862126669e-09, |
|
"loss": 0.0, |
|
"reward": 1.3908307328820229, |
|
"reward_std": 1.0579438470304012, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.6382682472467422, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.4244375005364418, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 12.551724137931034, |
|
"step": 100, |
|
"total_flos": 0.0, |
|
"train_loss": 1.1221799382710174e-06, |
|
"train_runtime": 1585.3495, |
|
"train_samples_per_second": 1.009, |
|
"train_steps_per_second": 0.063 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|