weege007's picture
Model save
69cd9b6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00022658935437895256,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 656.0,
"completions/max_terminated_length": 222.5,
"completions/mean_length": 194.0625,
"completions/mean_terminated_length": 136.94643020629883,
"completions/min_length": 61.5,
"completions/min_terminated_length": 61.5,
"epoch": 4.531787087579051e-06,
"frac_reward_zero_std": 0.5,
"grad_norm": 12.525100708007812,
"kl": 0.0,
"learning_rate": 1.6666666666666665e-07,
"loss": -0.0,
"num_tokens": 5377.0,
"reward": 0.0625,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0625,
"rewards/format_reward_func/std": 0.1767766922712326,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 618.0,
"completions/max_terminated_length": 198.0,
"completions/mean_length": 171.25,
"completions/mean_terminated_length": 114.93750381469727,
"completions/min_length": 57.5,
"completions/min_terminated_length": 57.5,
"epoch": 9.063574175158102e-06,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.001167318900115788,
"kl": 0.0009191570134134963,
"learning_rate": 5e-07,
"loss": 0.0,
"num_tokens": 10397.0,
"reward": 0.0625,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0625,
"rewards/format_reward_func/std": 0.1767766922712326,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 477.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 175.5,
"completions/mean_terminated_length": 175.5,
"completions/min_length": 38.5,
"completions/min_terminated_length": 38.5,
"epoch": 1.3595361262737154e-05,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0017322486964985728,
"kl": 0.0009386140663991682,
"learning_rate": 4.994757065594279e-07,
"loss": 0.0,
"num_tokens": 15493.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 307.0,
"completions/max_terminated_length": 307.0,
"completions/mean_length": 156.25,
"completions/mean_terminated_length": 156.25,
"completions/min_length": 47.0,
"completions/min_terminated_length": 47.0,
"epoch": 1.8127148350316204e-05,
"frac_reward_zero_std": 0.5,
"grad_norm": 11.28031063079834,
"kl": 0.0007646345866305637,
"learning_rate": 4.979050253066063e-07,
"loss": 0.0,
"num_tokens": 20313.0,
"reward": 0.0625,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0625,
"rewards/format_reward_func/std": 0.1767766922712326,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 312.5,
"completions/max_terminated_length": 312.5,
"completions/mean_length": 152.25,
"completions/mean_terminated_length": 152.25,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"epoch": 2.2658935437895258e-05,
"frac_reward_zero_std": 0.5,
"grad_norm": 9.902769088745117,
"kl": 0.0008557607743568951,
"learning_rate": 4.952945442245597e-07,
"loss": 0.0,
"num_tokens": 25061.0,
"reward": 0.0625,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0625,
"rewards/format_reward_func/std": 0.1767766922712326,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 543.5,
"completions/max_terminated_length": 543.5,
"completions/mean_length": 164.4375,
"completions/mean_terminated_length": 164.4375,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 2.7190722525474308e-05,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0027156081050634384,
"kl": 0.0010605865281831939,
"learning_rate": 4.916552125781528e-07,
"loss": 0.0,
"num_tokens": 29980.0,
"reward": 0.0625,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0625,
"rewards/format_reward_func/std": 0.1767766922712326,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 451.0,
"completions/max_terminated_length": 451.0,
"completions/mean_length": 211.0,
"completions/mean_terminated_length": 211.0,
"completions/min_length": 70.5,
"completions/min_terminated_length": 70.5,
"epoch": 3.172250961305336e-05,
"frac_reward_zero_std": 0.5,
"grad_norm": 12.591176986694336,
"kl": 0.0009377936348755611,
"learning_rate": 4.870022949890676e-07,
"loss": 0.0,
"num_tokens": 35676.0,
"reward": 0.0625,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0625,
"rewards/format_reward_func/std": 0.1767766922712326,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 465.5,
"completions/max_terminated_length": 465.5,
"completions/mean_length": 163.375,
"completions/mean_terminated_length": 163.375,
"completions/min_length": 49.0,
"completions/min_terminated_length": 49.0,
"epoch": 3.625429670063241e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.893583297729492,
"kl": 0.001199566273498931,
"learning_rate": 4.81355307410676e-07,
"loss": 0.0,
"num_tokens": 40570.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.125,
"rewards/format_reward_func/std": 0.3535533845424652,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 209.75,
"completions/mean_terminated_length": 209.75,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"epoch": 4.078608378821146e-05,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0009183982037939131,
"kl": 0.0008469254862575326,
"learning_rate": 4.747379352713488e-07,
"loss": 0.0,
"num_tokens": 46174.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 583.0,
"completions/max_terminated_length": 583.0,
"completions/mean_length": 211.125,
"completions/mean_terminated_length": 211.125,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"epoch": 4.5317870875790515e-05,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0012323512928560376,
"kl": 0.0012486951891332865,
"learning_rate": 4.6717793412953776e-07,
"loss": 0.0,
"num_tokens": 51832.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 620.5,
"completions/max_terminated_length": 285.5,
"completions/mean_length": 202.9375,
"completions/mean_terminated_length": 150.90178680419922,
"completions/min_length": 60.5,
"completions/min_terminated_length": 60.5,
"epoch": 4.984965796336956e-05,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0016945754177868366,
"kl": 0.00123220352907083,
"learning_rate": 4.5870701325731773e-07,
"loss": 0.0,
"num_tokens": 57327.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 683.0,
"completions/max_terminated_length": 434.5,
"completions/mean_length": 257.0,
"completions/mean_terminated_length": 209.44644165039062,
"completions/min_length": 84.5,
"completions/min_terminated_length": 84.5,
"epoch": 5.4381445050948616e-05,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.00123355642426759,
"kl": 0.0011914248134416994,
"learning_rate": 4.4936070264068016e-07,
"loss": 0.0,
"num_tokens": 63719.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 641.0,
"completions/max_terminated_length": 235.5,
"completions/mean_length": 181.3125,
"completions/mean_terminated_length": 125.05357360839844,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 5.891323213852767e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.299399375915527,
"kl": 0.0018970294222526718,
"learning_rate": 4.391782039544238e-07,
"loss": 0.0,
"num_tokens": 68924.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.125,
"rewards/format_reward_func/std": 0.3535533845424652,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 214.5,
"completions/max_terminated_length": 214.5,
"completions/mean_length": 108.6875,
"completions/mean_terminated_length": 108.6875,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 6.344501922610672e-05,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0035000136122107506,
"kl": 0.0019113743601337774,
"learning_rate": 4.282022261367073e-07,
"loss": 0.0,
"num_tokens": 72919.0,
"reward": 0.0625,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0625,
"rewards/format_reward_func/std": 0.1767766922712326,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 583.5,
"completions/max_terminated_length": 583.5,
"completions/mean_length": 217.625,
"completions/mean_terminated_length": 217.625,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 6.797680631368577e-05,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0021501195151358843,
"kl": 0.0015104188605619129,
"learning_rate": 4.1647880625292027e-07,
"loss": 0.0,
"num_tokens": 78713.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 335.5,
"completions/max_terminated_length": 335.5,
"completions/mean_length": 134.75,
"completions/mean_terminated_length": 134.75,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 7.250859340126482e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.151634216308594,
"kl": 0.0026571638591121882,
"learning_rate": 4.040571164002318e-07,
"loss": 0.0,
"num_tokens": 83149.0,
"reward": 0.1875,
"reward_std": 0.408231720328331,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.1875,
"rewards/format_reward_func/std": 0.408231720328331,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 381.5,
"completions/max_terminated_length": 381.5,
"completions/mean_length": 190.625,
"completions/mean_terminated_length": 190.625,
"completions/min_length": 34.5,
"completions/min_terminated_length": 34.5,
"epoch": 7.704038048884388e-05,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.006240386515855789,
"kl": 0.0022764305977034383,
"learning_rate": 3.909892574627266e-07,
"loss": 0.0,
"num_tokens": 88487.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 745.0,
"completions/max_terminated_length": 745.0,
"completions/mean_length": 219.75,
"completions/mean_terminated_length": 219.75,
"completions/min_length": 67.5,
"completions/min_terminated_length": 67.5,
"epoch": 8.157216757642292e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.2641544342041,
"kl": 0.002281889770529233,
"learning_rate": 3.773300405821908e-07,
"loss": 0.0,
"num_tokens": 94251.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.125,
"rewards/format_reward_func/std": 0.3535533845424652,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 436.0,
"completions/max_terminated_length": 436.0,
"completions/mean_length": 172.5,
"completions/mean_terminated_length": 172.5,
"completions/min_length": 66.0,
"completions/min_terminated_length": 66.0,
"epoch": 8.610395466400197e-05,
"frac_reward_zero_std": 0.5,
"grad_norm": 15.585210800170898,
"kl": 0.010958031325571937,
"learning_rate": 3.6313675726113475e-07,
"loss": 0.0,
"num_tokens": 99331.0,
"reward": 0.125,
"reward_std": 0.2314550280570984,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.125,
"rewards/format_reward_func/std": 0.2314550280570984,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 644.5,
"completions/max_terminated_length": 644.5,
"completions/mean_length": 238.0,
"completions/mean_terminated_length": 238.0,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"epoch": 9.063574175158103e-05,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0022101891227066517,
"kl": 0.003324320729007013,
"learning_rate": 3.484689390623218e-07,
"loss": 0.0,
"num_tokens": 105419.0,
"reward": 0.125,
"reward_std": 0.2314550280570984,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.125,
"rewards/format_reward_func/std": 0.2314550280570984,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 427.0,
"completions/max_terminated_length": 427.0,
"completions/mean_length": 203.3125,
"completions/mean_terminated_length": 203.3125,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"epoch": 9.516752883916008e-05,
"frac_reward_zero_std": 0.5,
"grad_norm": 16.7834529876709,
"kl": 0.002571089873526944,
"learning_rate": 3.3338810791270517e-07,
"loss": 0.0,
"num_tokens": 110992.0,
"reward": 0.0625,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0625,
"rewards/format_reward_func/std": 0.1767766922712326,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 267.5,
"completions/max_terminated_length": 267.5,
"completions/mean_length": 119.25,
"completions/mean_terminated_length": 119.25,
"completions/min_length": 55.5,
"completions/min_terminated_length": 55.5,
"epoch": 9.969931592673912e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.29578685760498,
"kl": 0.0029905991395935416,
"learning_rate": 3.179575180590857e-07,
"loss": 0.0,
"num_tokens": 115204.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.125,
"rewards/format_reward_func/std": 0.3535533845424652,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 326.5,
"completions/max_terminated_length": 326.5,
"completions/mean_length": 152.5,
"completions/mean_terminated_length": 152.5,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.00010423110301431818,
"frac_reward_zero_std": 0.5,
"grad_norm": 11.780332565307617,
"kl": 0.004898008599411696,
"learning_rate": 3.022418907578188e-07,
"loss": 0.0,
"num_tokens": 119916.0,
"reward": 0.0625,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0625,
"rewards/format_reward_func/std": 0.1767766922712326,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 756.5,
"completions/max_terminated_length": 756.5,
"completions/mean_length": 237.625,
"completions/mean_terminated_length": 237.625,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.00010876289010189723,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.306221008300781,
"kl": 0.004835129271668848,
"learning_rate": 2.863071428113726e-07,
"loss": 0.0,
"num_tokens": 125990.0,
"reward": 0.1875,
"reward_std": 0.408231720328331,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.1875,
"rewards/format_reward_func/std": 0.408231720328331,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 405.5,
"completions/max_terminated_length": 405.5,
"completions/mean_length": 189.9375,
"completions/mean_terminated_length": 189.9375,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.00011329467718947628,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.71300983428955,
"kl": 0.006452581874327734,
"learning_rate": 2.7022011009035107e-07,
"loss": 0.0,
"num_tokens": 131301.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.125,
"rewards/format_reward_func/std": 0.3535533845424652,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 361.0,
"completions/max_terminated_length": 361.0,
"completions/mean_length": 157.0625,
"completions/mean_terminated_length": 157.0625,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.00011782646427705534,
"frac_reward_zero_std": 0.5,
"grad_norm": 11.726592063903809,
"kl": 0.0094611946187797,
"learning_rate": 2.540482672006254e-07,
"loss": 0.0,
"num_tokens": 136102.0,
"reward": 0.25,
"reward_std": 0.26726123690605164,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.25,
"rewards/format_reward_func/std": 0.26726123690605164,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 213.0,
"completions/max_terminated_length": 213.0,
"completions/mean_length": 117.6875,
"completions/mean_terminated_length": 117.6875,
"completions/min_length": 44.5,
"completions/min_terminated_length": 44.5,
"epoch": 0.00012235825136463439,
"frac_reward_zero_std": 0.5,
"grad_norm": 13.030102729797363,
"kl": 0.006068318209145218,
"learning_rate": 2.37859444471388e-07,
"loss": 0.0,
"num_tokens": 140241.0,
"reward": 0.0625,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0625,
"rewards/format_reward_func/std": 0.1767766922712326,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 333.0,
"completions/max_terminated_length": 333.0,
"completions/mean_length": 136.25,
"completions/mean_terminated_length": 136.25,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.00012689003845221345,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.891292572021484,
"kl": 0.009872130773146637,
"learning_rate": 2.2172154345117894e-07,
"loss": 0.0,
"num_tokens": 144701.0,
"reward": 0.25,
"reward_std": 0.4355512708425522,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.25,
"rewards/format_reward_func/std": 0.4355513006448746,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 663.5,
"completions/max_terminated_length": 663.5,
"completions/mean_length": 196.625,
"completions/mean_terminated_length": 196.625,
"completions/min_length": 59.5,
"completions/min_terminated_length": 59.5,
"epoch": 0.00013142182553979248,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.184815406799316,
"kl": 0.01285637664841488,
"learning_rate": 2.0570225210519433e-07,
"loss": 0.0,
"num_tokens": 150159.0,
"reward": 0.1875,
"reward_std": 0.408231720328331,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.1875,
"rewards/format_reward_func/std": 0.408231720328331,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 308.0,
"completions/max_terminated_length": 308.0,
"completions/mean_length": 140.4375,
"completions/mean_terminated_length": 140.4375,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"epoch": 0.00013595361262737154,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.02632570080459118,
"kl": 0.014614543215429876,
"learning_rate": 1.8986876090843664e-07,
"loss": 0.0,
"num_tokens": 154654.0,
"reward": 0.1875,
"reward_std": 0.2587745785713196,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.1875,
"rewards/format_reward_func/std": 0.25877460837364197,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 270.0,
"completions/max_terminated_length": 270.0,
"completions/mean_length": 104.0625,
"completions/mean_terminated_length": 104.0625,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.0001404853997149506,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.164044380187988,
"kl": 0.007626835664268583,
"learning_rate": 1.7428748102551234e-07,
"loss": 0.0,
"num_tokens": 158599.0,
"reward": 0.625,
"reward_std": 0.49871626496315,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.625,
"rewards/format_reward_func/std": 0.49871626496315,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 160.5,
"completions/max_terminated_length": 160.5,
"completions/mean_length": 90.625,
"completions/mean_terminated_length": 90.625,
"completions/min_length": 47.0,
"completions/min_terminated_length": 47.0,
"epoch": 0.00014501718680252963,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.527629852294922,
"kl": 0.011474673578049988,
"learning_rate": 1.5902376575912814e-07,
"loss": 0.0,
"num_tokens": 162289.0,
"reward": 0.3125,
"reward_std": 0.44403792917728424,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.3125,
"rewards/format_reward_func/std": 0.44403792917728424,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 398.5,
"completions/max_terminated_length": 398.5,
"completions/mean_length": 129.0625,
"completions/mean_terminated_length": 129.0625,
"completions/min_length": 47.0,
"completions/min_terminated_length": 47.0,
"epoch": 0.0001495489738901087,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.219941139221191,
"kl": 0.009613552174414508,
"learning_rate": 1.4414163643562753e-07,
"loss": 0.0,
"num_tokens": 166674.0,
"reward": 0.25,
"reward_std": 0.4355512708425522,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.25,
"rewards/format_reward_func/std": 0.4355513006448746,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 244.5,
"completions/max_terminated_length": 244.5,
"completions/mean_length": 109.1875,
"completions/mean_terminated_length": 109.1875,
"completions/min_length": 45.5,
"completions/min_terminated_length": 45.5,
"epoch": 0.00015408076097768775,
"frac_reward_zero_std": 0.0,
"grad_norm": 20.217954635620117,
"kl": 0.020021719275973737,
"learning_rate": 1.2970351387729872e-07,
"loss": 0.0,
"num_tokens": 170693.0,
"reward": 0.375,
"reward_std": 0.5175491571426392,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.375,
"rewards/format_reward_func/std": 0.5175492167472839,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 329.0,
"completions/max_terminated_length": 329.0,
"completions/mean_length": 127.0625,
"completions/mean_terminated_length": 127.0625,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.0001586125480652668,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.027848243713379,
"kl": 0.01639675306796562,
"learning_rate": 1.1576995658775404e-07,
"loss": 0.0,
"num_tokens": 175014.0,
"reward": 0.4375,
"reward_std": 0.5260358154773712,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.4375,
"rewards/format_reward_func/std": 0.5260358452796936,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 375.0,
"completions/max_terminated_length": 375.0,
"completions/mean_length": 142.1875,
"completions/mean_terminated_length": 142.1875,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"epoch": 0.00016314433515284585,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.927833557128906,
"kl": 0.011894080380443484,
"learning_rate": 1.0239940674851941e-07,
"loss": 0.0,
"num_tokens": 179577.0,
"reward": 0.25,
"reward_std": 0.4629100561141968,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.25,
"rewards/format_reward_func/std": 0.4629100561141968,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 951.0,
"completions/max_terminated_length": 951.0,
"completions/mean_length": 273.6875,
"completions/mean_terminated_length": 273.6875,
"completions/min_length": 41.5,
"completions/min_terminated_length": 41.5,
"epoch": 0.0001676761222404249,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.783898830413818,
"kl": 0.014139190083369613,
"learning_rate": 8.964794509221507e-08,
"loss": 0.0,
"num_tokens": 186236.0,
"reward": 0.375,
"reward_std": 0.4355512708425522,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.375,
"rewards/format_reward_func/std": 0.4355513006448746,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 174.5,
"completions/max_terminated_length": 174.5,
"completions/mean_length": 106.25,
"completions/mean_terminated_length": 106.25,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"epoch": 0.00017220790932800394,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.524798393249512,
"kl": 0.013552291362429969,
"learning_rate": 7.756905568047392e-08,
"loss": 0.0,
"num_tokens": 190168.0,
"reward": 0.5625,
"reward_std": 0.5260358154773712,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.5625,
"rewards/format_reward_func/std": 0.5260358452796936,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 647.5,
"completions/max_terminated_length": 647.5,
"completions/mean_length": 207.5625,
"completions/mean_terminated_length": 207.5625,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.000176739696415583,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.13427448272705,
"kl": 0.01421075320104137,
"learning_rate": 6.621340157319996e-08,
"loss": 0.0,
"num_tokens": 195793.0,
"reward": 0.375,
"reward_std": 0.4355512708425522,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.375,
"rewards/format_reward_func/std": 0.4355513006448746,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 620.0,
"completions/max_terminated_length": 281.0,
"completions/mean_length": 187.5625,
"completions/mean_terminated_length": 134.85714721679688,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.00018127148350316206,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.1104793548584,
"kl": 0.014304678879852872,
"learning_rate": 5.5628612330087724e-08,
"loss": 0.0,
"num_tokens": 201050.0,
"reward": 0.25,
"reward_std": 0.4355512708425522,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.25,
"rewards/format_reward_func/std": 0.4355513006448746,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 746.0,
"completions/max_terminated_length": 345.5,
"completions/mean_length": 174.6875,
"completions/mean_terminated_length": 116.39286041259766,
"completions/min_length": 50.5,
"completions/min_terminated_length": 50.5,
"epoch": 0.0001858032705907411,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.687472343444824,
"kl": 0.012647167037357576,
"learning_rate": 4.5859084235697235e-08,
"loss": 0.0,
"num_tokens": 206125.0,
"reward": 0.25,
"reward_std": 0.4629100561141968,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.25,
"rewards/format_reward_func/std": 0.4629100561141968,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 240.5,
"completions/max_terminated_length": 240.5,
"completions/mean_length": 104.75,
"completions/mean_terminated_length": 104.75,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.00019033505767832015,
"frac_reward_zero_std": 0.0,
"grad_norm": 27.33205795288086,
"kl": 0.01843659658334218,
"learning_rate": 3.6945794086007705e-08,
"loss": 0.0,
"num_tokens": 210097.0,
"reward": 0.625,
"reward_std": 0.49871626496315,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.625,
"rewards/format_reward_func/std": 0.49871626496315,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 516.5,
"completions/max_terminated_length": 516.5,
"completions/mean_length": 146.9375,
"completions/mean_terminated_length": 146.9375,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.00019486684476589921,
"frac_reward_zero_std": 0.0,
"grad_norm": 17.247251510620117,
"kl": 0.014375057930010371,
"learning_rate": 2.892612731749414e-08,
"loss": 0.0,
"num_tokens": 214696.0,
"reward": 0.375,
"reward_std": 0.5175491571426392,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.375,
"rewards/format_reward_func/std": 0.5175492167472839,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 176.0,
"completions/max_terminated_length": 176.0,
"completions/mean_length": 92.375,
"completions/mean_terminated_length": 92.375,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.00019939863185347825,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.001086235046387,
"kl": 0.015669465501559898,
"learning_rate": 2.183372119961499e-08,
"loss": 0.0,
"num_tokens": 218470.0,
"reward": 0.1875,
"reward_std": 0.408231720328331,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.1875,
"rewards/format_reward_func/std": 0.408231720328331,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 522.0,
"completions/max_terminated_length": 522.0,
"completions/mean_length": 174.0,
"completions/mean_terminated_length": 174.0,
"completions/min_length": 55.5,
"completions/min_terminated_length": 55.5,
"epoch": 0.0002039304189410573,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.844170570373535,
"kl": 0.009546459768898785,
"learning_rate": 1.5698323748414122e-08,
"loss": 0.0,
"num_tokens": 223534.0,
"reward": 0.375,
"reward_std": 0.4355512708425522,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.375,
"rewards/format_reward_func/std": 0.4355513006448746,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 331.5,
"completions/max_terminated_length": 331.5,
"completions/mean_length": 151.8125,
"completions/mean_terminated_length": 151.8125,
"completions/min_length": 48.5,
"completions/min_terminated_length": 48.5,
"epoch": 0.00020846220602863637,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.816337585449219,
"kl": 0.01593888070783578,
"learning_rate": 1.054566895300324e-08,
"loss": 0.0,
"num_tokens": 228243.0,
"reward": 0.5625,
"reward_std": 0.5260358154773712,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.5625,
"rewards/format_reward_func/std": 0.5260358452796936,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 387.0,
"completions/max_terminated_length": 387.0,
"completions/mean_length": 163.6875,
"completions/mean_terminated_length": 163.6875,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.0002129939931162154,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.730420112609863,
"kl": 0.01412112163961865,
"learning_rate": 6.397368838268496e-09,
"loss": 0.0,
"num_tokens": 233110.0,
"reward": 0.1875,
"reward_std": 0.408231720328331,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.1875,
"rewards/format_reward_func/std": 0.408231720328331,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 420.0,
"completions/max_terminated_length": 420.0,
"completions/mean_length": 192.4375,
"completions/mean_terminated_length": 192.4375,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"epoch": 0.00021752578020379446,
"frac_reward_zero_std": 0.5,
"grad_norm": 15.307268142700195,
"kl": 0.00981484999647364,
"learning_rate": 3.2708228165273244e-09,
"loss": 0.0,
"num_tokens": 238493.0,
"reward": 0.0625,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0625,
"rewards/format_reward_func/std": 0.1767766922712326,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 372.5,
"completions/max_terminated_length": 372.5,
"completions/mean_length": 108.1875,
"completions/mean_terminated_length": 108.1875,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.00022205756729137352,
"frac_reward_zero_std": 0.0,
"grad_norm": 23.4942684173584,
"kl": 0.022864260390633717,
"learning_rate": 1.1791447083465133e-09,
"loss": 0.0,
"num_tokens": 242536.0,
"reward": 0.25,
"reward_std": 0.4629100561141968,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.25,
"rewards/format_reward_func/std": 0.4629100561141968,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 426.0,
"completions/max_terminated_length": 426.0,
"completions/mean_length": 194.3125,
"completions/mean_terminated_length": 194.3125,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.00022658935437895256,
"frac_reward_zero_std": 0.0,
"grad_norm": 23.61033058166504,
"kl": 0.0231838297622744,
"learning_rate": 1.3110773862126667e-10,
"loss": 0.0,
"num_tokens": 247917.0,
"reward": 0.4375,
"reward_std": 0.5260358154773712,
"rewards/equation_reward_func/mean": 0.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.4375,
"rewards/format_reward_func/std": 0.5260358452796936,
"step": 100
},
{
"epoch": 0.00022658935437895256,
"step": 100,
"total_flos": 0.0,
"train_loss": 8.04626297735922e-06,
"train_runtime": 2640.6632,
"train_samples_per_second": 0.303,
"train_steps_per_second": 0.038
}
],
"logging_steps": 2,
"max_steps": 100,
"num_input_tokens_seen": 247917,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}