{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00022658935437895256, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 656.0, "completions/max_terminated_length": 222.5, "completions/mean_length": 194.0625, "completions/mean_terminated_length": 136.94643020629883, "completions/min_length": 61.5, "completions/min_terminated_length": 61.5, "epoch": 4.531787087579051e-06, "frac_reward_zero_std": 0.5, "grad_norm": 12.525100708007812, "kl": 0.0, "learning_rate": 1.6666666666666665e-07, "loss": -0.0, "num_tokens": 5377.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.1767766922712326, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 618.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 114.93750381469727, "completions/min_length": 57.5, "completions/min_terminated_length": 57.5, "epoch": 9.063574175158102e-06, "frac_reward_zero_std": 0.5, "grad_norm": 0.001167318900115788, "kl": 0.0009191570134134963, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 10397.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.1767766922712326, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 175.5, "completions/mean_terminated_length": 175.5, "completions/min_length": 38.5, "completions/min_terminated_length": 38.5, "epoch": 1.3595361262737154e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017322486964985728, "kl": 0.0009386140663991682, "learning_rate": 4.994757065594279e-07, "loss": 0.0, "num_tokens": 15493.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 156.25, "completions/mean_terminated_length": 156.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.8127148350316204e-05, "frac_reward_zero_std": 0.5, "grad_norm": 11.28031063079834, "kl": 0.0007646345866305637, "learning_rate": 4.979050253066063e-07, "loss": 0.0, "num_tokens": 20313.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.1767766922712326, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.5, "completions/max_terminated_length": 312.5, "completions/mean_length": 152.25, "completions/mean_terminated_length": 152.25, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.2658935437895258e-05, "frac_reward_zero_std": 0.5, "grad_norm": 9.902769088745117, "kl": 0.0008557607743568951, "learning_rate": 4.952945442245597e-07, "loss": 0.0, "num_tokens": 25061.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.1767766922712326, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.5, "completions/max_terminated_length": 543.5, "completions/mean_length": 164.4375, "completions/mean_terminated_length": 164.4375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.7190722525474308e-05, "frac_reward_zero_std": 0.5, "grad_norm": 0.0027156081050634384, "kl": 0.0010605865281831939, "learning_rate": 4.916552125781528e-07, "loss": 0.0, "num_tokens": 29980.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.1767766922712326, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 211.0, "completions/min_length": 70.5, "completions/min_terminated_length": 70.5, "epoch": 3.172250961305336e-05, "frac_reward_zero_std": 0.5, "grad_norm": 12.591176986694336, "kl": 0.0009377936348755611, "learning_rate": 4.870022949890676e-07, "loss": 0.0, "num_tokens": 35676.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.1767766922712326, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.5, "completions/max_terminated_length": 465.5, "completions/mean_length": 163.375, "completions/mean_terminated_length": 163.375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 3.625429670063241e-05, "frac_reward_zero_std": 0.0, "grad_norm": 10.893583297729492, "kl": 0.001199566273498931, "learning_rate": 4.81355307410676e-07, "loss": 0.0, "num_tokens": 40570.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.3535533845424652, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 209.75, "completions/mean_terminated_length": 209.75, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 4.078608378821146e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009183982037939131, "kl": 0.0008469254862575326, "learning_rate": 4.747379352713488e-07, "loss": 0.0, "num_tokens": 46174.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 211.125, "completions/mean_terminated_length": 211.125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 4.5317870875790515e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012323512928560376, "kl": 0.0012486951891332865, "learning_rate": 4.6717793412953776e-07, "loss": 0.0, "num_tokens": 51832.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 620.5, "completions/max_terminated_length": 285.5, "completions/mean_length": 202.9375, "completions/mean_terminated_length": 150.90178680419922, "completions/min_length": 60.5, "completions/min_terminated_length": 60.5, "epoch": 4.984965796336956e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016945754177868366, "kl": 0.00123220352907083, "learning_rate": 4.5870701325731773e-07, "loss": 0.0, "num_tokens": 57327.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 683.0, "completions/max_terminated_length": 434.5, "completions/mean_length": 257.0, "completions/mean_terminated_length": 209.44644165039062, "completions/min_length": 84.5, "completions/min_terminated_length": 84.5, "epoch": 5.4381445050948616e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.00123355642426759, "kl": 0.0011914248134416994, "learning_rate": 4.4936070264068016e-07, "loss": 0.0, "num_tokens": 63719.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 641.0, "completions/max_terminated_length": 235.5, "completions/mean_length": 181.3125, "completions/mean_terminated_length": 125.05357360839844, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 5.891323213852767e-05, "frac_reward_zero_std": 0.0, "grad_norm": 13.299399375915527, "kl": 0.0018970294222526718, "learning_rate": 4.391782039544238e-07, "loss": 0.0, "num_tokens": 68924.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.3535533845424652, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.5, "completions/max_terminated_length": 214.5, "completions/mean_length": 108.6875, "completions/mean_terminated_length": 108.6875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 6.344501922610672e-05, "frac_reward_zero_std": 0.5, "grad_norm": 0.0035000136122107506, "kl": 0.0019113743601337774, "learning_rate": 4.282022261367073e-07, "loss": 0.0, "num_tokens": 72919.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.1767766922712326, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.5, "completions/max_terminated_length": 583.5, "completions/mean_length": 217.625, "completions/mean_terminated_length": 217.625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 6.797680631368577e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021501195151358843, "kl": 0.0015104188605619129, "learning_rate": 4.1647880625292027e-07, "loss": 0.0, "num_tokens": 78713.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.5, "completions/max_terminated_length": 335.5, "completions/mean_length": 134.75, "completions/mean_terminated_length": 134.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 7.250859340126482e-05, "frac_reward_zero_std": 0.0, "grad_norm": 14.151634216308594, "kl": 0.0026571638591121882, "learning_rate": 4.040571164002318e-07, "loss": 0.0, "num_tokens": 83149.0, "reward": 0.1875, "reward_std": 0.408231720328331, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.408231720328331, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.5, "completions/max_terminated_length": 381.5, "completions/mean_length": 190.625, "completions/mean_terminated_length": 190.625, "completions/min_length": 34.5, "completions/min_terminated_length": 34.5, "epoch": 7.704038048884388e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.006240386515855789, "kl": 0.0022764305977034383, "learning_rate": 3.909892574627266e-07, "loss": 0.0, "num_tokens": 88487.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 67.5, "completions/min_terminated_length": 67.5, "epoch": 8.157216757642292e-05, "frac_reward_zero_std": 0.0, "grad_norm": 16.2641544342041, "kl": 0.002281889770529233, "learning_rate": 3.773300405821908e-07, "loss": 0.0, "num_tokens": 94251.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.3535533845424652, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 8.610395466400197e-05, "frac_reward_zero_std": 0.5, "grad_norm": 15.585210800170898, "kl": 0.010958031325571937, "learning_rate": 3.6313675726113475e-07, "loss": 0.0, "num_tokens": 99331.0, "reward": 0.125, "reward_std": 0.2314550280570984, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.2314550280570984, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.5, "completions/max_terminated_length": 644.5, "completions/mean_length": 238.0, "completions/mean_terminated_length": 238.0, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 9.063574175158103e-05, "frac_reward_zero_std": 0.5, "grad_norm": 0.0022101891227066517, "kl": 0.003324320729007013, "learning_rate": 3.484689390623218e-07, "loss": 0.0, "num_tokens": 105419.0, "reward": 0.125, "reward_std": 0.2314550280570984, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.2314550280570984, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 203.3125, "completions/mean_terminated_length": 203.3125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 9.516752883916008e-05, "frac_reward_zero_std": 0.5, "grad_norm": 16.7834529876709, "kl": 0.002571089873526944, "learning_rate": 3.3338810791270517e-07, "loss": 0.0, "num_tokens": 110992.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.1767766922712326, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.5, "completions/max_terminated_length": 267.5, "completions/mean_length": 119.25, "completions/mean_terminated_length": 119.25, "completions/min_length": 55.5, "completions/min_terminated_length": 55.5, "epoch": 9.969931592673912e-05, "frac_reward_zero_std": 0.0, "grad_norm": 13.29578685760498, "kl": 0.0029905991395935416, "learning_rate": 3.179575180590857e-07, "loss": 0.0, "num_tokens": 115204.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.3535533845424652, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.5, "completions/max_terminated_length": 326.5, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.00010423110301431818, "frac_reward_zero_std": 0.5, "grad_norm": 11.780332565307617, "kl": 0.004898008599411696, "learning_rate": 3.022418907578188e-07, "loss": 0.0, "num_tokens": 119916.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.1767766922712326, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.5, "completions/max_terminated_length": 756.5, "completions/mean_length": 237.625, "completions/mean_terminated_length": 237.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.00010876289010189723, "frac_reward_zero_std": 0.0, "grad_norm": 14.306221008300781, "kl": 0.004835129271668848, "learning_rate": 2.863071428113726e-07, "loss": 0.0, "num_tokens": 125990.0, "reward": 0.1875, "reward_std": 0.408231720328331, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.408231720328331, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.5, "completions/max_terminated_length": 405.5, "completions/mean_length": 189.9375, "completions/mean_terminated_length": 189.9375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.00011329467718947628, "frac_reward_zero_std": 0.0, "grad_norm": 10.71300983428955, "kl": 0.006452581874327734, "learning_rate": 2.7022011009035107e-07, "loss": 0.0, "num_tokens": 131301.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.3535533845424652, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 157.0625, "completions/mean_terminated_length": 157.0625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.00011782646427705534, "frac_reward_zero_std": 0.5, "grad_norm": 11.726592063903809, "kl": 0.0094611946187797, "learning_rate": 2.540482672006254e-07, "loss": 0.0, "num_tokens": 136102.0, "reward": 0.25, "reward_std": 0.26726123690605164, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.26726123690605164, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 117.6875, "completions/mean_terminated_length": 117.6875, "completions/min_length": 44.5, "completions/min_terminated_length": 44.5, "epoch": 0.00012235825136463439, "frac_reward_zero_std": 0.5, "grad_norm": 13.030102729797363, "kl": 0.006068318209145218, "learning_rate": 2.37859444471388e-07, "loss": 0.0, "num_tokens": 140241.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.1767766922712326, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.00012689003845221345, "frac_reward_zero_std": 0.0, "grad_norm": 10.891292572021484, "kl": 0.009872130773146637, "learning_rate": 2.2172154345117894e-07, "loss": 0.0, "num_tokens": 144701.0, "reward": 0.25, "reward_std": 0.4355512708425522, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4355513006448746, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.5, "completions/max_terminated_length": 663.5, "completions/mean_length": 196.625, "completions/mean_terminated_length": 196.625, "completions/min_length": 59.5, "completions/min_terminated_length": 59.5, "epoch": 0.00013142182553979248, "frac_reward_zero_std": 0.0, "grad_norm": 11.184815406799316, "kl": 0.01285637664841488, "learning_rate": 2.0570225210519433e-07, "loss": 0.0, "num_tokens": 150159.0, "reward": 0.1875, "reward_std": 0.408231720328331, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.408231720328331, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 140.4375, "completions/mean_terminated_length": 140.4375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.00013595361262737154, "frac_reward_zero_std": 0.5, "grad_norm": 0.02632570080459118, "kl": 0.014614543215429876, "learning_rate": 1.8986876090843664e-07, "loss": 0.0, "num_tokens": 154654.0, "reward": 0.1875, "reward_std": 0.2587745785713196, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.25877460837364197, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 104.0625, "completions/mean_terminated_length": 104.0625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0001404853997149506, "frac_reward_zero_std": 0.0, "grad_norm": 11.164044380187988, "kl": 0.007626835664268583, "learning_rate": 1.7428748102551234e-07, "loss": 0.0, "num_tokens": 158599.0, "reward": 0.625, "reward_std": 0.49871626496315, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49871626496315, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.5, "completions/max_terminated_length": 160.5, "completions/mean_length": 90.625, "completions/mean_terminated_length": 90.625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.00014501718680252963, "frac_reward_zero_std": 0.0, "grad_norm": 16.527629852294922, "kl": 0.011474673578049988, "learning_rate": 1.5902376575912814e-07, "loss": 0.0, "num_tokens": 162289.0, "reward": 0.3125, "reward_std": 0.44403792917728424, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.44403792917728424, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.5, "completions/max_terminated_length": 398.5, "completions/mean_length": 129.0625, "completions/mean_terminated_length": 129.0625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.0001495489738901087, "frac_reward_zero_std": 0.0, "grad_norm": 12.219941139221191, "kl": 0.009613552174414508, "learning_rate": 1.4414163643562753e-07, "loss": 0.0, "num_tokens": 166674.0, "reward": 0.25, "reward_std": 0.4355512708425522, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4355513006448746, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.5, "completions/max_terminated_length": 244.5, "completions/mean_length": 109.1875, "completions/mean_terminated_length": 109.1875, "completions/min_length": 45.5, "completions/min_terminated_length": 45.5, "epoch": 0.00015408076097768775, "frac_reward_zero_std": 0.0, "grad_norm": 20.217954635620117, "kl": 0.020021719275973737, "learning_rate": 1.2970351387729872e-07, "loss": 0.0, "num_tokens": 170693.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.5175492167472839, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 127.0625, "completions/mean_terminated_length": 127.0625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.0001586125480652668, "frac_reward_zero_std": 0.0, "grad_norm": 12.027848243713379, "kl": 0.01639675306796562, "learning_rate": 1.1576995658775404e-07, "loss": 0.0, "num_tokens": 175014.0, "reward": 0.4375, "reward_std": 0.5260358154773712, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.5260358452796936, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 142.1875, "completions/mean_terminated_length": 142.1875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.00016314433515284585, "frac_reward_zero_std": 0.0, "grad_norm": 10.927833557128906, "kl": 0.011894080380443484, "learning_rate": 1.0239940674851941e-07, "loss": 0.0, "num_tokens": 179577.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4629100561141968, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 273.6875, "completions/mean_terminated_length": 273.6875, "completions/min_length": 41.5, "completions/min_terminated_length": 41.5, "epoch": 0.0001676761222404249, "frac_reward_zero_std": 0.0, "grad_norm": 6.783898830413818, "kl": 0.014139190083369613, "learning_rate": 8.964794509221507e-08, "loss": 0.0, "num_tokens": 186236.0, "reward": 0.375, "reward_std": 0.4355512708425522, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.4355513006448746, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.5, "completions/max_terminated_length": 174.5, "completions/mean_length": 106.25, "completions/mean_terminated_length": 106.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.00017220790932800394, "frac_reward_zero_std": 0.0, "grad_norm": 12.524798393249512, "kl": 0.013552291362429969, "learning_rate": 7.756905568047392e-08, "loss": 0.0, "num_tokens": 190168.0, "reward": 0.5625, "reward_std": 0.5260358154773712, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.5260358452796936, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.5, "completions/max_terminated_length": 647.5, "completions/mean_length": 207.5625, "completions/mean_terminated_length": 207.5625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.000176739696415583, "frac_reward_zero_std": 0.0, "grad_norm": 10.13427448272705, "kl": 0.01421075320104137, "learning_rate": 6.621340157319996e-08, "loss": 0.0, "num_tokens": 195793.0, "reward": 0.375, "reward_std": 0.4355512708425522, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.4355513006448746, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 620.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 187.5625, "completions/mean_terminated_length": 134.85714721679688, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.00018127148350316206, "frac_reward_zero_std": 0.0, "grad_norm": 16.1104793548584, "kl": 0.014304678879852872, "learning_rate": 5.5628612330087724e-08, "loss": 0.0, "num_tokens": 201050.0, "reward": 0.25, "reward_std": 0.4355512708425522, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4355513006448746, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 746.0, "completions/max_terminated_length": 345.5, "completions/mean_length": 174.6875, "completions/mean_terminated_length": 116.39286041259766, "completions/min_length": 50.5, "completions/min_terminated_length": 50.5, "epoch": 0.0001858032705907411, "frac_reward_zero_std": 0.0, "grad_norm": 13.687472343444824, "kl": 0.012647167037357576, "learning_rate": 4.5859084235697235e-08, "loss": 0.0, "num_tokens": 206125.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4629100561141968, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.5, "completions/max_terminated_length": 240.5, "completions/mean_length": 104.75, "completions/mean_terminated_length": 104.75, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.00019033505767832015, "frac_reward_zero_std": 0.0, "grad_norm": 27.33205795288086, "kl": 0.01843659658334218, "learning_rate": 3.6945794086007705e-08, "loss": 0.0, "num_tokens": 210097.0, "reward": 0.625, "reward_std": 0.49871626496315, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.49871626496315, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.5, "completions/max_terminated_length": 516.5, "completions/mean_length": 146.9375, "completions/mean_terminated_length": 146.9375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.00019486684476589921, "frac_reward_zero_std": 0.0, "grad_norm": 17.247251510620117, "kl": 0.014375057930010371, "learning_rate": 2.892612731749414e-08, "loss": 0.0, "num_tokens": 214696.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.5175492167472839, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 92.375, "completions/mean_terminated_length": 92.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.00019939863185347825, "frac_reward_zero_std": 0.0, "grad_norm": 12.001086235046387, "kl": 0.015669465501559898, "learning_rate": 2.183372119961499e-08, "loss": 0.0, "num_tokens": 218470.0, "reward": 0.1875, "reward_std": 0.408231720328331, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.408231720328331, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 55.5, "completions/min_terminated_length": 55.5, "epoch": 0.0002039304189410573, "frac_reward_zero_std": 0.0, "grad_norm": 11.844170570373535, "kl": 0.009546459768898785, "learning_rate": 1.5698323748414122e-08, "loss": 0.0, "num_tokens": 223534.0, "reward": 0.375, "reward_std": 0.4355512708425522, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.4355513006448746, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.5, "completions/max_terminated_length": 331.5, "completions/mean_length": 151.8125, "completions/mean_terminated_length": 151.8125, "completions/min_length": 48.5, "completions/min_terminated_length": 48.5, "epoch": 0.00020846220602863637, "frac_reward_zero_std": 0.0, "grad_norm": 11.816337585449219, "kl": 0.01593888070783578, "learning_rate": 1.054566895300324e-08, "loss": 0.0, "num_tokens": 228243.0, "reward": 0.5625, "reward_std": 0.5260358154773712, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.5260358452796936, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 163.6875, "completions/mean_terminated_length": 163.6875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0002129939931162154, "frac_reward_zero_std": 0.0, "grad_norm": 11.730420112609863, "kl": 0.01412112163961865, "learning_rate": 6.397368838268496e-09, "loss": 0.0, "num_tokens": 233110.0, "reward": 0.1875, "reward_std": 0.408231720328331, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.408231720328331, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 192.4375, "completions/mean_terminated_length": 192.4375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.00021752578020379446, "frac_reward_zero_std": 0.5, "grad_norm": 15.307268142700195, "kl": 0.00981484999647364, "learning_rate": 3.2708228165273244e-09, "loss": 0.0, "num_tokens": 238493.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.1767766922712326, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.5, "completions/max_terminated_length": 372.5, "completions/mean_length": 108.1875, "completions/mean_terminated_length": 108.1875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.00022205756729137352, "frac_reward_zero_std": 0.0, "grad_norm": 23.4942684173584, "kl": 0.022864260390633717, "learning_rate": 1.1791447083465133e-09, "loss": 0.0, "num_tokens": 242536.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.4629100561141968, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 194.3125, "completions/mean_terminated_length": 194.3125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.00022658935437895256, "frac_reward_zero_std": 0.0, "grad_norm": 23.61033058166504, "kl": 0.0231838297622744, "learning_rate": 1.3110773862126667e-10, "loss": 0.0, "num_tokens": 247917.0, "reward": 0.4375, "reward_std": 0.5260358154773712, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.5260358452796936, "step": 100 }, { "epoch": 0.00022658935437895256, "step": 100, "total_flos": 0.0, "train_loss": 8.04626297735922e-06, "train_runtime": 2640.6632, "train_samples_per_second": 0.303, "train_steps_per_second": 0.038 } ], "logging_steps": 2, "max_steps": 100, "num_input_tokens_seen": 247917, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }