|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.00022658935437895256, |
|
"eval_steps": 500, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 656.0, |
|
"completions/max_terminated_length": 222.5, |
|
"completions/mean_length": 194.0625, |
|
"completions/mean_terminated_length": 136.94643020629883, |
|
"completions/min_length": 61.5, |
|
"completions/min_terminated_length": 61.5, |
|
"epoch": 4.531787087579051e-06, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 12.525100708007812, |
|
"kl": 0.0, |
|
"learning_rate": 1.6666666666666665e-07, |
|
"loss": -0.0, |
|
"num_tokens": 5377.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0625, |
|
"rewards/format_reward_func/std": 0.1767766922712326, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 618.0, |
|
"completions/max_terminated_length": 198.0, |
|
"completions/mean_length": 171.25, |
|
"completions/mean_terminated_length": 114.93750381469727, |
|
"completions/min_length": 57.5, |
|
"completions/min_terminated_length": 57.5, |
|
"epoch": 9.063574175158102e-06, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.001167318900115788, |
|
"kl": 0.0009191570134134963, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0, |
|
"num_tokens": 10397.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0625, |
|
"rewards/format_reward_func/std": 0.1767766922712326, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 477.0, |
|
"completions/max_terminated_length": 477.0, |
|
"completions/mean_length": 175.5, |
|
"completions/mean_terminated_length": 175.5, |
|
"completions/min_length": 38.5, |
|
"completions/min_terminated_length": 38.5, |
|
"epoch": 1.3595361262737154e-05, |
|
"frac_reward_zero_std": 1.0, |
|
"grad_norm": 0.0017322486964985728, |
|
"kl": 0.0009386140663991682, |
|
"learning_rate": 4.994757065594279e-07, |
|
"loss": 0.0, |
|
"num_tokens": 15493.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0, |
|
"rewards/format_reward_func/std": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 307.0, |
|
"completions/max_terminated_length": 307.0, |
|
"completions/mean_length": 156.25, |
|
"completions/mean_terminated_length": 156.25, |
|
"completions/min_length": 47.0, |
|
"completions/min_terminated_length": 47.0, |
|
"epoch": 1.8127148350316204e-05, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 11.28031063079834, |
|
"kl": 0.0007646345866305637, |
|
"learning_rate": 4.979050253066063e-07, |
|
"loss": 0.0, |
|
"num_tokens": 20313.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0625, |
|
"rewards/format_reward_func/std": 0.1767766922712326, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 312.5, |
|
"completions/max_terminated_length": 312.5, |
|
"completions/mean_length": 152.25, |
|
"completions/mean_terminated_length": 152.25, |
|
"completions/min_length": 65.0, |
|
"completions/min_terminated_length": 65.0, |
|
"epoch": 2.2658935437895258e-05, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 9.902769088745117, |
|
"kl": 0.0008557607743568951, |
|
"learning_rate": 4.952945442245597e-07, |
|
"loss": 0.0, |
|
"num_tokens": 25061.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0625, |
|
"rewards/format_reward_func/std": 0.1767766922712326, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 543.5, |
|
"completions/max_terminated_length": 543.5, |
|
"completions/mean_length": 164.4375, |
|
"completions/mean_terminated_length": 164.4375, |
|
"completions/min_length": 38.0, |
|
"completions/min_terminated_length": 38.0, |
|
"epoch": 2.7190722525474308e-05, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.0027156081050634384, |
|
"kl": 0.0010605865281831939, |
|
"learning_rate": 4.916552125781528e-07, |
|
"loss": 0.0, |
|
"num_tokens": 29980.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0625, |
|
"rewards/format_reward_func/std": 0.1767766922712326, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 451.0, |
|
"completions/max_terminated_length": 451.0, |
|
"completions/mean_length": 211.0, |
|
"completions/mean_terminated_length": 211.0, |
|
"completions/min_length": 70.5, |
|
"completions/min_terminated_length": 70.5, |
|
"epoch": 3.172250961305336e-05, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 12.591176986694336, |
|
"kl": 0.0009377936348755611, |
|
"learning_rate": 4.870022949890676e-07, |
|
"loss": 0.0, |
|
"num_tokens": 35676.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0625, |
|
"rewards/format_reward_func/std": 0.1767766922712326, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 465.5, |
|
"completions/max_terminated_length": 465.5, |
|
"completions/mean_length": 163.375, |
|
"completions/mean_terminated_length": 163.375, |
|
"completions/min_length": 49.0, |
|
"completions/min_terminated_length": 49.0, |
|
"epoch": 3.625429670063241e-05, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 10.893583297729492, |
|
"kl": 0.001199566273498931, |
|
"learning_rate": 4.81355307410676e-07, |
|
"loss": 0.0, |
|
"num_tokens": 40570.0, |
|
"reward": 0.125, |
|
"reward_std": 0.3535533845424652, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.125, |
|
"rewards/format_reward_func/std": 0.3535533845424652, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 401.0, |
|
"completions/max_terminated_length": 401.0, |
|
"completions/mean_length": 209.75, |
|
"completions/mean_terminated_length": 209.75, |
|
"completions/min_length": 63.0, |
|
"completions/min_terminated_length": 63.0, |
|
"epoch": 4.078608378821146e-05, |
|
"frac_reward_zero_std": 1.0, |
|
"grad_norm": 0.0009183982037939131, |
|
"kl": 0.0008469254862575326, |
|
"learning_rate": 4.747379352713488e-07, |
|
"loss": 0.0, |
|
"num_tokens": 46174.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0, |
|
"rewards/format_reward_func/std": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 583.0, |
|
"completions/max_terminated_length": 583.0, |
|
"completions/mean_length": 211.125, |
|
"completions/mean_terminated_length": 211.125, |
|
"completions/min_length": 69.0, |
|
"completions/min_terminated_length": 69.0, |
|
"epoch": 4.5317870875790515e-05, |
|
"frac_reward_zero_std": 1.0, |
|
"grad_norm": 0.0012323512928560376, |
|
"kl": 0.0012486951891332865, |
|
"learning_rate": 4.6717793412953776e-07, |
|
"loss": 0.0, |
|
"num_tokens": 51832.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0, |
|
"rewards/format_reward_func/std": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 620.5, |
|
"completions/max_terminated_length": 285.5, |
|
"completions/mean_length": 202.9375, |
|
"completions/mean_terminated_length": 150.90178680419922, |
|
"completions/min_length": 60.5, |
|
"completions/min_terminated_length": 60.5, |
|
"epoch": 4.984965796336956e-05, |
|
"frac_reward_zero_std": 1.0, |
|
"grad_norm": 0.0016945754177868366, |
|
"kl": 0.00123220352907083, |
|
"learning_rate": 4.5870701325731773e-07, |
|
"loss": 0.0, |
|
"num_tokens": 57327.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0, |
|
"rewards/format_reward_func/std": 0.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 683.0, |
|
"completions/max_terminated_length": 434.5, |
|
"completions/mean_length": 257.0, |
|
"completions/mean_terminated_length": 209.44644165039062, |
|
"completions/min_length": 84.5, |
|
"completions/min_terminated_length": 84.5, |
|
"epoch": 5.4381445050948616e-05, |
|
"frac_reward_zero_std": 1.0, |
|
"grad_norm": 0.00123355642426759, |
|
"kl": 0.0011914248134416994, |
|
"learning_rate": 4.4936070264068016e-07, |
|
"loss": 0.0, |
|
"num_tokens": 63719.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0, |
|
"rewards/format_reward_func/std": 0.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 641.0, |
|
"completions/max_terminated_length": 235.5, |
|
"completions/mean_length": 181.3125, |
|
"completions/mean_terminated_length": 125.05357360839844, |
|
"completions/min_length": 35.0, |
|
"completions/min_terminated_length": 35.0, |
|
"epoch": 5.891323213852767e-05, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 13.299399375915527, |
|
"kl": 0.0018970294222526718, |
|
"learning_rate": 4.391782039544238e-07, |
|
"loss": 0.0, |
|
"num_tokens": 68924.0, |
|
"reward": 0.125, |
|
"reward_std": 0.3535533845424652, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.125, |
|
"rewards/format_reward_func/std": 0.3535533845424652, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 214.5, |
|
"completions/max_terminated_length": 214.5, |
|
"completions/mean_length": 108.6875, |
|
"completions/mean_terminated_length": 108.6875, |
|
"completions/min_length": 34.0, |
|
"completions/min_terminated_length": 34.0, |
|
"epoch": 6.344501922610672e-05, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.0035000136122107506, |
|
"kl": 0.0019113743601337774, |
|
"learning_rate": 4.282022261367073e-07, |
|
"loss": 0.0, |
|
"num_tokens": 72919.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0625, |
|
"rewards/format_reward_func/std": 0.1767766922712326, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 583.5, |
|
"completions/max_terminated_length": 583.5, |
|
"completions/mean_length": 217.625, |
|
"completions/mean_terminated_length": 217.625, |
|
"completions/min_length": 37.0, |
|
"completions/min_terminated_length": 37.0, |
|
"epoch": 6.797680631368577e-05, |
|
"frac_reward_zero_std": 1.0, |
|
"grad_norm": 0.0021501195151358843, |
|
"kl": 0.0015104188605619129, |
|
"learning_rate": 4.1647880625292027e-07, |
|
"loss": 0.0, |
|
"num_tokens": 78713.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0, |
|
"rewards/format_reward_func/std": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 335.5, |
|
"completions/max_terminated_length": 335.5, |
|
"completions/mean_length": 134.75, |
|
"completions/mean_terminated_length": 134.75, |
|
"completions/min_length": 32.0, |
|
"completions/min_terminated_length": 32.0, |
|
"epoch": 7.250859340126482e-05, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 14.151634216308594, |
|
"kl": 0.0026571638591121882, |
|
"learning_rate": 4.040571164002318e-07, |
|
"loss": 0.0, |
|
"num_tokens": 83149.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.408231720328331, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.1875, |
|
"rewards/format_reward_func/std": 0.408231720328331, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 381.5, |
|
"completions/max_terminated_length": 381.5, |
|
"completions/mean_length": 190.625, |
|
"completions/mean_terminated_length": 190.625, |
|
"completions/min_length": 34.5, |
|
"completions/min_terminated_length": 34.5, |
|
"epoch": 7.704038048884388e-05, |
|
"frac_reward_zero_std": 1.0, |
|
"grad_norm": 0.006240386515855789, |
|
"kl": 0.0022764305977034383, |
|
"learning_rate": 3.909892574627266e-07, |
|
"loss": 0.0, |
|
"num_tokens": 88487.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0, |
|
"rewards/format_reward_func/std": 0.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 745.0, |
|
"completions/max_terminated_length": 745.0, |
|
"completions/mean_length": 219.75, |
|
"completions/mean_terminated_length": 219.75, |
|
"completions/min_length": 67.5, |
|
"completions/min_terminated_length": 67.5, |
|
"epoch": 8.157216757642292e-05, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 16.2641544342041, |
|
"kl": 0.002281889770529233, |
|
"learning_rate": 3.773300405821908e-07, |
|
"loss": 0.0, |
|
"num_tokens": 94251.0, |
|
"reward": 0.125, |
|
"reward_std": 0.3535533845424652, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.125, |
|
"rewards/format_reward_func/std": 0.3535533845424652, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 436.0, |
|
"completions/max_terminated_length": 436.0, |
|
"completions/mean_length": 172.5, |
|
"completions/mean_terminated_length": 172.5, |
|
"completions/min_length": 66.0, |
|
"completions/min_terminated_length": 66.0, |
|
"epoch": 8.610395466400197e-05, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 15.585210800170898, |
|
"kl": 0.010958031325571937, |
|
"learning_rate": 3.6313675726113475e-07, |
|
"loss": 0.0, |
|
"num_tokens": 99331.0, |
|
"reward": 0.125, |
|
"reward_std": 0.2314550280570984, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.125, |
|
"rewards/format_reward_func/std": 0.2314550280570984, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 644.5, |
|
"completions/max_terminated_length": 644.5, |
|
"completions/mean_length": 238.0, |
|
"completions/mean_terminated_length": 238.0, |
|
"completions/min_length": 64.0, |
|
"completions/min_terminated_length": 64.0, |
|
"epoch": 9.063574175158103e-05, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.0022101891227066517, |
|
"kl": 0.003324320729007013, |
|
"learning_rate": 3.484689390623218e-07, |
|
"loss": 0.0, |
|
"num_tokens": 105419.0, |
|
"reward": 0.125, |
|
"reward_std": 0.2314550280570984, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.125, |
|
"rewards/format_reward_func/std": 0.2314550280570984, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 427.0, |
|
"completions/max_terminated_length": 427.0, |
|
"completions/mean_length": 203.3125, |
|
"completions/mean_terminated_length": 203.3125, |
|
"completions/min_length": 67.0, |
|
"completions/min_terminated_length": 67.0, |
|
"epoch": 9.516752883916008e-05, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 16.7834529876709, |
|
"kl": 0.002571089873526944, |
|
"learning_rate": 3.3338810791270517e-07, |
|
"loss": 0.0, |
|
"num_tokens": 110992.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0625, |
|
"rewards/format_reward_func/std": 0.1767766922712326, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 267.5, |
|
"completions/max_terminated_length": 267.5, |
|
"completions/mean_length": 119.25, |
|
"completions/mean_terminated_length": 119.25, |
|
"completions/min_length": 55.5, |
|
"completions/min_terminated_length": 55.5, |
|
"epoch": 9.969931592673912e-05, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 13.29578685760498, |
|
"kl": 0.0029905991395935416, |
|
"learning_rate": 3.179575180590857e-07, |
|
"loss": 0.0, |
|
"num_tokens": 115204.0, |
|
"reward": 0.125, |
|
"reward_std": 0.3535533845424652, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.125, |
|
"rewards/format_reward_func/std": 0.3535533845424652, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 326.5, |
|
"completions/max_terminated_length": 326.5, |
|
"completions/mean_length": 152.5, |
|
"completions/mean_terminated_length": 152.5, |
|
"completions/min_length": 26.0, |
|
"completions/min_terminated_length": 26.0, |
|
"epoch": 0.00010423110301431818, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 11.780332565307617, |
|
"kl": 0.004898008599411696, |
|
"learning_rate": 3.022418907578188e-07, |
|
"loss": 0.0, |
|
"num_tokens": 119916.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0625, |
|
"rewards/format_reward_func/std": 0.1767766922712326, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 756.5, |
|
"completions/max_terminated_length": 756.5, |
|
"completions/mean_length": 237.625, |
|
"completions/mean_terminated_length": 237.625, |
|
"completions/min_length": 35.0, |
|
"completions/min_terminated_length": 35.0, |
|
"epoch": 0.00010876289010189723, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 14.306221008300781, |
|
"kl": 0.004835129271668848, |
|
"learning_rate": 2.863071428113726e-07, |
|
"loss": 0.0, |
|
"num_tokens": 125990.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.408231720328331, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.1875, |
|
"rewards/format_reward_func/std": 0.408231720328331, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 405.5, |
|
"completions/max_terminated_length": 405.5, |
|
"completions/mean_length": 189.9375, |
|
"completions/mean_terminated_length": 189.9375, |
|
"completions/min_length": 67.0, |
|
"completions/min_terminated_length": 67.0, |
|
"epoch": 0.00011329467718947628, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 10.71300983428955, |
|
"kl": 0.006452581874327734, |
|
"learning_rate": 2.7022011009035107e-07, |
|
"loss": 0.0, |
|
"num_tokens": 131301.0, |
|
"reward": 0.125, |
|
"reward_std": 0.3535533845424652, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.125, |
|
"rewards/format_reward_func/std": 0.3535533845424652, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 361.0, |
|
"completions/max_terminated_length": 361.0, |
|
"completions/mean_length": 157.0625, |
|
"completions/mean_terminated_length": 157.0625, |
|
"completions/min_length": 59.0, |
|
"completions/min_terminated_length": 59.0, |
|
"epoch": 0.00011782646427705534, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 11.726592063903809, |
|
"kl": 0.0094611946187797, |
|
"learning_rate": 2.540482672006254e-07, |
|
"loss": 0.0, |
|
"num_tokens": 136102.0, |
|
"reward": 0.25, |
|
"reward_std": 0.26726123690605164, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.25, |
|
"rewards/format_reward_func/std": 0.26726123690605164, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 213.0, |
|
"completions/max_terminated_length": 213.0, |
|
"completions/mean_length": 117.6875, |
|
"completions/mean_terminated_length": 117.6875, |
|
"completions/min_length": 44.5, |
|
"completions/min_terminated_length": 44.5, |
|
"epoch": 0.00012235825136463439, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 13.030102729797363, |
|
"kl": 0.006068318209145218, |
|
"learning_rate": 2.37859444471388e-07, |
|
"loss": 0.0, |
|
"num_tokens": 140241.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0625, |
|
"rewards/format_reward_func/std": 0.1767766922712326, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 333.0, |
|
"completions/max_terminated_length": 333.0, |
|
"completions/mean_length": 136.25, |
|
"completions/mean_terminated_length": 136.25, |
|
"completions/min_length": 36.0, |
|
"completions/min_terminated_length": 36.0, |
|
"epoch": 0.00012689003845221345, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 10.891292572021484, |
|
"kl": 0.009872130773146637, |
|
"learning_rate": 2.2172154345117894e-07, |
|
"loss": 0.0, |
|
"num_tokens": 144701.0, |
|
"reward": 0.25, |
|
"reward_std": 0.4355512708425522, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.25, |
|
"rewards/format_reward_func/std": 0.4355513006448746, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 663.5, |
|
"completions/max_terminated_length": 663.5, |
|
"completions/mean_length": 196.625, |
|
"completions/mean_terminated_length": 196.625, |
|
"completions/min_length": 59.5, |
|
"completions/min_terminated_length": 59.5, |
|
"epoch": 0.00013142182553979248, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 11.184815406799316, |
|
"kl": 0.01285637664841488, |
|
"learning_rate": 2.0570225210519433e-07, |
|
"loss": 0.0, |
|
"num_tokens": 150159.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.408231720328331, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.1875, |
|
"rewards/format_reward_func/std": 0.408231720328331, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 308.0, |
|
"completions/max_terminated_length": 308.0, |
|
"completions/mean_length": 140.4375, |
|
"completions/mean_terminated_length": 140.4375, |
|
"completions/min_length": 50.0, |
|
"completions/min_terminated_length": 50.0, |
|
"epoch": 0.00013595361262737154, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.02632570080459118, |
|
"kl": 0.014614543215429876, |
|
"learning_rate": 1.8986876090843664e-07, |
|
"loss": 0.0, |
|
"num_tokens": 154654.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.2587745785713196, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.1875, |
|
"rewards/format_reward_func/std": 0.25877460837364197, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 270.0, |
|
"completions/max_terminated_length": 270.0, |
|
"completions/mean_length": 104.0625, |
|
"completions/mean_terminated_length": 104.0625, |
|
"completions/min_length": 38.0, |
|
"completions/min_terminated_length": 38.0, |
|
"epoch": 0.0001404853997149506, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 11.164044380187988, |
|
"kl": 0.007626835664268583, |
|
"learning_rate": 1.7428748102551234e-07, |
|
"loss": 0.0, |
|
"num_tokens": 158599.0, |
|
"reward": 0.625, |
|
"reward_std": 0.49871626496315, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.625, |
|
"rewards/format_reward_func/std": 0.49871626496315, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 160.5, |
|
"completions/max_terminated_length": 160.5, |
|
"completions/mean_length": 90.625, |
|
"completions/mean_terminated_length": 90.625, |
|
"completions/min_length": 47.0, |
|
"completions/min_terminated_length": 47.0, |
|
"epoch": 0.00014501718680252963, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 16.527629852294922, |
|
"kl": 0.011474673578049988, |
|
"learning_rate": 1.5902376575912814e-07, |
|
"loss": 0.0, |
|
"num_tokens": 162289.0, |
|
"reward": 0.3125, |
|
"reward_std": 0.44403792917728424, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.3125, |
|
"rewards/format_reward_func/std": 0.44403792917728424, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 398.5, |
|
"completions/max_terminated_length": 398.5, |
|
"completions/mean_length": 129.0625, |
|
"completions/mean_terminated_length": 129.0625, |
|
"completions/min_length": 47.0, |
|
"completions/min_terminated_length": 47.0, |
|
"epoch": 0.0001495489738901087, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 12.219941139221191, |
|
"kl": 0.009613552174414508, |
|
"learning_rate": 1.4414163643562753e-07, |
|
"loss": 0.0, |
|
"num_tokens": 166674.0, |
|
"reward": 0.25, |
|
"reward_std": 0.4355512708425522, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.25, |
|
"rewards/format_reward_func/std": 0.4355513006448746, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 244.5, |
|
"completions/max_terminated_length": 244.5, |
|
"completions/mean_length": 109.1875, |
|
"completions/mean_terminated_length": 109.1875, |
|
"completions/min_length": 45.5, |
|
"completions/min_terminated_length": 45.5, |
|
"epoch": 0.00015408076097768775, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 20.217954635620117, |
|
"kl": 0.020021719275973737, |
|
"learning_rate": 1.2970351387729872e-07, |
|
"loss": 0.0, |
|
"num_tokens": 170693.0, |
|
"reward": 0.375, |
|
"reward_std": 0.5175491571426392, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.375, |
|
"rewards/format_reward_func/std": 0.5175492167472839, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 329.0, |
|
"completions/max_terminated_length": 329.0, |
|
"completions/mean_length": 127.0625, |
|
"completions/mean_terminated_length": 127.0625, |
|
"completions/min_length": 45.0, |
|
"completions/min_terminated_length": 45.0, |
|
"epoch": 0.0001586125480652668, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 12.027848243713379, |
|
"kl": 0.01639675306796562, |
|
"learning_rate": 1.1576995658775404e-07, |
|
"loss": 0.0, |
|
"num_tokens": 175014.0, |
|
"reward": 0.4375, |
|
"reward_std": 0.5260358154773712, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.4375, |
|
"rewards/format_reward_func/std": 0.5260358452796936, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 375.0, |
|
"completions/max_terminated_length": 375.0, |
|
"completions/mean_length": 142.1875, |
|
"completions/mean_terminated_length": 142.1875, |
|
"completions/min_length": 52.0, |
|
"completions/min_terminated_length": 52.0, |
|
"epoch": 0.00016314433515284585, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 10.927833557128906, |
|
"kl": 0.011894080380443484, |
|
"learning_rate": 1.0239940674851941e-07, |
|
"loss": 0.0, |
|
"num_tokens": 179577.0, |
|
"reward": 0.25, |
|
"reward_std": 0.4629100561141968, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.25, |
|
"rewards/format_reward_func/std": 0.4629100561141968, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 951.0, |
|
"completions/max_terminated_length": 951.0, |
|
"completions/mean_length": 273.6875, |
|
"completions/mean_terminated_length": 273.6875, |
|
"completions/min_length": 41.5, |
|
"completions/min_terminated_length": 41.5, |
|
"epoch": 0.0001676761222404249, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 6.783898830413818, |
|
"kl": 0.014139190083369613, |
|
"learning_rate": 8.964794509221507e-08, |
|
"loss": 0.0, |
|
"num_tokens": 186236.0, |
|
"reward": 0.375, |
|
"reward_std": 0.4355512708425522, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.375, |
|
"rewards/format_reward_func/std": 0.4355513006448746, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 174.5, |
|
"completions/max_terminated_length": 174.5, |
|
"completions/mean_length": 106.25, |
|
"completions/mean_terminated_length": 106.25, |
|
"completions/min_length": 44.0, |
|
"completions/min_terminated_length": 44.0, |
|
"epoch": 0.00017220790932800394, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 12.524798393249512, |
|
"kl": 0.013552291362429969, |
|
"learning_rate": 7.756905568047392e-08, |
|
"loss": 0.0, |
|
"num_tokens": 190168.0, |
|
"reward": 0.5625, |
|
"reward_std": 0.5260358154773712, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.5625, |
|
"rewards/format_reward_func/std": 0.5260358452796936, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 647.5, |
|
"completions/max_terminated_length": 647.5, |
|
"completions/mean_length": 207.5625, |
|
"completions/mean_terminated_length": 207.5625, |
|
"completions/min_length": 46.0, |
|
"completions/min_terminated_length": 46.0, |
|
"epoch": 0.000176739696415583, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 10.13427448272705, |
|
"kl": 0.01421075320104137, |
|
"learning_rate": 6.621340157319996e-08, |
|
"loss": 0.0, |
|
"num_tokens": 195793.0, |
|
"reward": 0.375, |
|
"reward_std": 0.4355512708425522, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.375, |
|
"rewards/format_reward_func/std": 0.4355513006448746, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 620.0, |
|
"completions/max_terminated_length": 281.0, |
|
"completions/mean_length": 187.5625, |
|
"completions/mean_terminated_length": 134.85714721679688, |
|
"completions/min_length": 40.0, |
|
"completions/min_terminated_length": 40.0, |
|
"epoch": 0.00018127148350316206, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 16.1104793548584, |
|
"kl": 0.014304678879852872, |
|
"learning_rate": 5.5628612330087724e-08, |
|
"loss": 0.0, |
|
"num_tokens": 201050.0, |
|
"reward": 0.25, |
|
"reward_std": 0.4355512708425522, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.25, |
|
"rewards/format_reward_func/std": 0.4355513006448746, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 746.0, |
|
"completions/max_terminated_length": 345.5, |
|
"completions/mean_length": 174.6875, |
|
"completions/mean_terminated_length": 116.39286041259766, |
|
"completions/min_length": 50.5, |
|
"completions/min_terminated_length": 50.5, |
|
"epoch": 0.0001858032705907411, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 13.687472343444824, |
|
"kl": 0.012647167037357576, |
|
"learning_rate": 4.5859084235697235e-08, |
|
"loss": 0.0, |
|
"num_tokens": 206125.0, |
|
"reward": 0.25, |
|
"reward_std": 0.4629100561141968, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.25, |
|
"rewards/format_reward_func/std": 0.4629100561141968, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 240.5, |
|
"completions/max_terminated_length": 240.5, |
|
"completions/mean_length": 104.75, |
|
"completions/mean_terminated_length": 104.75, |
|
"completions/min_length": 56.0, |
|
"completions/min_terminated_length": 56.0, |
|
"epoch": 0.00019033505767832015, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 27.33205795288086, |
|
"kl": 0.01843659658334218, |
|
"learning_rate": 3.6945794086007705e-08, |
|
"loss": 0.0, |
|
"num_tokens": 210097.0, |
|
"reward": 0.625, |
|
"reward_std": 0.49871626496315, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.625, |
|
"rewards/format_reward_func/std": 0.49871626496315, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 516.5, |
|
"completions/max_terminated_length": 516.5, |
|
"completions/mean_length": 146.9375, |
|
"completions/mean_terminated_length": 146.9375, |
|
"completions/min_length": 45.0, |
|
"completions/min_terminated_length": 45.0, |
|
"epoch": 0.00019486684476589921, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 17.247251510620117, |
|
"kl": 0.014375057930010371, |
|
"learning_rate": 2.892612731749414e-08, |
|
"loss": 0.0, |
|
"num_tokens": 214696.0, |
|
"reward": 0.375, |
|
"reward_std": 0.5175491571426392, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.375, |
|
"rewards/format_reward_func/std": 0.5175492167472839, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 176.0, |
|
"completions/max_terminated_length": 176.0, |
|
"completions/mean_length": 92.375, |
|
"completions/mean_terminated_length": 92.375, |
|
"completions/min_length": 39.0, |
|
"completions/min_terminated_length": 39.0, |
|
"epoch": 0.00019939863185347825, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 12.001086235046387, |
|
"kl": 0.015669465501559898, |
|
"learning_rate": 2.183372119961499e-08, |
|
"loss": 0.0, |
|
"num_tokens": 218470.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.408231720328331, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.1875, |
|
"rewards/format_reward_func/std": 0.408231720328331, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 522.0, |
|
"completions/max_terminated_length": 522.0, |
|
"completions/mean_length": 174.0, |
|
"completions/mean_terminated_length": 174.0, |
|
"completions/min_length": 55.5, |
|
"completions/min_terminated_length": 55.5, |
|
"epoch": 0.0002039304189410573, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 11.844170570373535, |
|
"kl": 0.009546459768898785, |
|
"learning_rate": 1.5698323748414122e-08, |
|
"loss": 0.0, |
|
"num_tokens": 223534.0, |
|
"reward": 0.375, |
|
"reward_std": 0.4355512708425522, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.375, |
|
"rewards/format_reward_func/std": 0.4355513006448746, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 331.5, |
|
"completions/max_terminated_length": 331.5, |
|
"completions/mean_length": 151.8125, |
|
"completions/mean_terminated_length": 151.8125, |
|
"completions/min_length": 48.5, |
|
"completions/min_terminated_length": 48.5, |
|
"epoch": 0.00020846220602863637, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 11.816337585449219, |
|
"kl": 0.01593888070783578, |
|
"learning_rate": 1.054566895300324e-08, |
|
"loss": 0.0, |
|
"num_tokens": 228243.0, |
|
"reward": 0.5625, |
|
"reward_std": 0.5260358154773712, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.5625, |
|
"rewards/format_reward_func/std": 0.5260358452796936, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 387.0, |
|
"completions/max_terminated_length": 387.0, |
|
"completions/mean_length": 163.6875, |
|
"completions/mean_terminated_length": 163.6875, |
|
"completions/min_length": 33.0, |
|
"completions/min_terminated_length": 33.0, |
|
"epoch": 0.0002129939931162154, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 11.730420112609863, |
|
"kl": 0.01412112163961865, |
|
"learning_rate": 6.397368838268496e-09, |
|
"loss": 0.0, |
|
"num_tokens": 233110.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.408231720328331, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.1875, |
|
"rewards/format_reward_func/std": 0.408231720328331, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 420.0, |
|
"completions/max_terminated_length": 420.0, |
|
"completions/mean_length": 192.4375, |
|
"completions/mean_terminated_length": 192.4375, |
|
"completions/min_length": 50.0, |
|
"completions/min_terminated_length": 50.0, |
|
"epoch": 0.00021752578020379446, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 15.307268142700195, |
|
"kl": 0.00981484999647364, |
|
"learning_rate": 3.2708228165273244e-09, |
|
"loss": 0.0, |
|
"num_tokens": 238493.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.0625, |
|
"rewards/format_reward_func/std": 0.1767766922712326, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 372.5, |
|
"completions/max_terminated_length": 372.5, |
|
"completions/mean_length": 108.1875, |
|
"completions/mean_terminated_length": 108.1875, |
|
"completions/min_length": 37.0, |
|
"completions/min_terminated_length": 37.0, |
|
"epoch": 0.00022205756729137352, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 23.4942684173584, |
|
"kl": 0.022864260390633717, |
|
"learning_rate": 1.1791447083465133e-09, |
|
"loss": 0.0, |
|
"num_tokens": 242536.0, |
|
"reward": 0.25, |
|
"reward_std": 0.4629100561141968, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.25, |
|
"rewards/format_reward_func/std": 0.4629100561141968, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 426.0, |
|
"completions/max_terminated_length": 426.0, |
|
"completions/mean_length": 194.3125, |
|
"completions/mean_terminated_length": 194.3125, |
|
"completions/min_length": 37.0, |
|
"completions/min_terminated_length": 37.0, |
|
"epoch": 0.00022658935437895256, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 23.61033058166504, |
|
"kl": 0.0231838297622744, |
|
"learning_rate": 1.3110773862126667e-10, |
|
"loss": 0.0, |
|
"num_tokens": 247917.0, |
|
"reward": 0.4375, |
|
"reward_std": 0.5260358154773712, |
|
"rewards/equation_reward_func/mean": 0.0, |
|
"rewards/equation_reward_func/std": 0.0, |
|
"rewards/format_reward_func/mean": 0.4375, |
|
"rewards/format_reward_func/std": 0.5260358452796936, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.00022658935437895256, |
|
"step": 100, |
|
"total_flos": 0.0, |
|
"train_loss": 8.04626297735922e-06, |
|
"train_runtime": 2640.6632, |
|
"train_samples_per_second": 0.303, |
|
"train_steps_per_second": 0.038 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 247917, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|