Model save

69cd9b6 verified about 1 month ago

52 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 0.00022658935437895256,
	"eval_steps": 500,
	"global_step": 100,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 656.0,
	"completions/max_terminated_length": 222.5,
	"completions/mean_length": 194.0625,
	"completions/mean_terminated_length": 136.94643020629883,
	"completions/min_length": 61.5,
	"completions/min_terminated_length": 61.5,
	"epoch": 4.531787087579051e-06,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 12.525100708007812,
	"kl": 0.0,
	"learning_rate": 1.6666666666666665e-07,
	"loss": -0.0,
	"num_tokens": 5377.0,
	"reward": 0.0625,
	"reward_std": 0.1767766922712326,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0625,
	"rewards/format_reward_func/std": 0.1767766922712326,
	"step": 2
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 618.0,
	"completions/max_terminated_length": 198.0,
	"completions/mean_length": 171.25,
	"completions/mean_terminated_length": 114.93750381469727,
	"completions/min_length": 57.5,
	"completions/min_terminated_length": 57.5,
	"epoch": 9.063574175158102e-06,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.001167318900115788,
	"kl": 0.0009191570134134963,
	"learning_rate": 5e-07,
	"loss": 0.0,
	"num_tokens": 10397.0,
	"reward": 0.0625,
	"reward_std": 0.1767766922712326,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0625,
	"rewards/format_reward_func/std": 0.1767766922712326,
	"step": 4
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 477.0,
	"completions/max_terminated_length": 477.0,
	"completions/mean_length": 175.5,
	"completions/mean_terminated_length": 175.5,
	"completions/min_length": 38.5,
	"completions/min_terminated_length": 38.5,
	"epoch": 1.3595361262737154e-05,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0017322486964985728,
	"kl": 0.0009386140663991682,
	"learning_rate": 4.994757065594279e-07,
	"loss": 0.0,
	"num_tokens": 15493.0,
	"reward": 0.0,
	"reward_std": 0.0,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 6
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 307.0,
	"completions/max_terminated_length": 307.0,
	"completions/mean_length": 156.25,
	"completions/mean_terminated_length": 156.25,
	"completions/min_length": 47.0,
	"completions/min_terminated_length": 47.0,
	"epoch": 1.8127148350316204e-05,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 11.28031063079834,
	"kl": 0.0007646345866305637,
	"learning_rate": 4.979050253066063e-07,
	"loss": 0.0,
	"num_tokens": 20313.0,
	"reward": 0.0625,
	"reward_std": 0.1767766922712326,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0625,
	"rewards/format_reward_func/std": 0.1767766922712326,
	"step": 8
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 312.5,
	"completions/max_terminated_length": 312.5,
	"completions/mean_length": 152.25,
	"completions/mean_terminated_length": 152.25,
	"completions/min_length": 65.0,
	"completions/min_terminated_length": 65.0,
	"epoch": 2.2658935437895258e-05,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 9.902769088745117,
	"kl": 0.0008557607743568951,
	"learning_rate": 4.952945442245597e-07,
	"loss": 0.0,
	"num_tokens": 25061.0,
	"reward": 0.0625,
	"reward_std": 0.1767766922712326,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0625,
	"rewards/format_reward_func/std": 0.1767766922712326,
	"step": 10
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 543.5,
	"completions/max_terminated_length": 543.5,
	"completions/mean_length": 164.4375,
	"completions/mean_terminated_length": 164.4375,
	"completions/min_length": 38.0,
	"completions/min_terminated_length": 38.0,
	"epoch": 2.7190722525474308e-05,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.0027156081050634384,
	"kl": 0.0010605865281831939,
	"learning_rate": 4.916552125781528e-07,
	"loss": 0.0,
	"num_tokens": 29980.0,
	"reward": 0.0625,
	"reward_std": 0.1767766922712326,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0625,
	"rewards/format_reward_func/std": 0.1767766922712326,
	"step": 12
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 451.0,
	"completions/max_terminated_length": 451.0,
	"completions/mean_length": 211.0,
	"completions/mean_terminated_length": 211.0,
	"completions/min_length": 70.5,
	"completions/min_terminated_length": 70.5,
	"epoch": 3.172250961305336e-05,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 12.591176986694336,
	"kl": 0.0009377936348755611,
	"learning_rate": 4.870022949890676e-07,
	"loss": 0.0,
	"num_tokens": 35676.0,
	"reward": 0.0625,
	"reward_std": 0.1767766922712326,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0625,
	"rewards/format_reward_func/std": 0.1767766922712326,
	"step": 14
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 465.5,
	"completions/max_terminated_length": 465.5,
	"completions/mean_length": 163.375,
	"completions/mean_terminated_length": 163.375,
	"completions/min_length": 49.0,
	"completions/min_terminated_length": 49.0,
	"epoch": 3.625429670063241e-05,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 10.893583297729492,
	"kl": 0.001199566273498931,
	"learning_rate": 4.81355307410676e-07,
	"loss": 0.0,
	"num_tokens": 40570.0,
	"reward": 0.125,
	"reward_std": 0.3535533845424652,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.125,
	"rewards/format_reward_func/std": 0.3535533845424652,
	"step": 16
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 401.0,
	"completions/max_terminated_length": 401.0,
	"completions/mean_length": 209.75,
	"completions/mean_terminated_length": 209.75,
	"completions/min_length": 63.0,
	"completions/min_terminated_length": 63.0,
	"epoch": 4.078608378821146e-05,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0009183982037939131,
	"kl": 0.0008469254862575326,
	"learning_rate": 4.747379352713488e-07,
	"loss": 0.0,
	"num_tokens": 46174.0,
	"reward": 0.0,
	"reward_std": 0.0,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 18
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 583.0,
	"completions/max_terminated_length": 583.0,
	"completions/mean_length": 211.125,
	"completions/mean_terminated_length": 211.125,
	"completions/min_length": 69.0,
	"completions/min_terminated_length": 69.0,
	"epoch": 4.5317870875790515e-05,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0012323512928560376,
	"kl": 0.0012486951891332865,
	"learning_rate": 4.6717793412953776e-07,
	"loss": 0.0,
	"num_tokens": 51832.0,
	"reward": 0.0,
	"reward_std": 0.0,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 20
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 620.5,
	"completions/max_terminated_length": 285.5,
	"completions/mean_length": 202.9375,
	"completions/mean_terminated_length": 150.90178680419922,
	"completions/min_length": 60.5,
	"completions/min_terminated_length": 60.5,
	"epoch": 4.984965796336956e-05,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0016945754177868366,
	"kl": 0.00123220352907083,
	"learning_rate": 4.5870701325731773e-07,
	"loss": 0.0,
	"num_tokens": 57327.0,
	"reward": 0.0,
	"reward_std": 0.0,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 22
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 683.0,
	"completions/max_terminated_length": 434.5,
	"completions/mean_length": 257.0,
	"completions/mean_terminated_length": 209.44644165039062,
	"completions/min_length": 84.5,
	"completions/min_terminated_length": 84.5,
	"epoch": 5.4381445050948616e-05,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.00123355642426759,
	"kl": 0.0011914248134416994,
	"learning_rate": 4.4936070264068016e-07,
	"loss": 0.0,
	"num_tokens": 63719.0,
	"reward": 0.0,
	"reward_std": 0.0,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 24
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 641.0,
	"completions/max_terminated_length": 235.5,
	"completions/mean_length": 181.3125,
	"completions/mean_terminated_length": 125.05357360839844,
	"completions/min_length": 35.0,
	"completions/min_terminated_length": 35.0,
	"epoch": 5.891323213852767e-05,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 13.299399375915527,
	"kl": 0.0018970294222526718,
	"learning_rate": 4.391782039544238e-07,
	"loss": 0.0,
	"num_tokens": 68924.0,
	"reward": 0.125,
	"reward_std": 0.3535533845424652,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.125,
	"rewards/format_reward_func/std": 0.3535533845424652,
	"step": 26
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 214.5,
	"completions/max_terminated_length": 214.5,
	"completions/mean_length": 108.6875,
	"completions/mean_terminated_length": 108.6875,
	"completions/min_length": 34.0,
	"completions/min_terminated_length": 34.0,
	"epoch": 6.344501922610672e-05,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.0035000136122107506,
	"kl": 0.0019113743601337774,
	"learning_rate": 4.282022261367073e-07,
	"loss": 0.0,
	"num_tokens": 72919.0,
	"reward": 0.0625,
	"reward_std": 0.1767766922712326,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0625,
	"rewards/format_reward_func/std": 0.1767766922712326,
	"step": 28
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 583.5,
	"completions/max_terminated_length": 583.5,
	"completions/mean_length": 217.625,
	"completions/mean_terminated_length": 217.625,
	"completions/min_length": 37.0,
	"completions/min_terminated_length": 37.0,
	"epoch": 6.797680631368577e-05,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0021501195151358843,
	"kl": 0.0015104188605619129,
	"learning_rate": 4.1647880625292027e-07,
	"loss": 0.0,
	"num_tokens": 78713.0,
	"reward": 0.0,
	"reward_std": 0.0,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 30
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 335.5,
	"completions/max_terminated_length": 335.5,
	"completions/mean_length": 134.75,
	"completions/mean_terminated_length": 134.75,
	"completions/min_length": 32.0,
	"completions/min_terminated_length": 32.0,
	"epoch": 7.250859340126482e-05,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 14.151634216308594,
	"kl": 0.0026571638591121882,
	"learning_rate": 4.040571164002318e-07,
	"loss": 0.0,
	"num_tokens": 83149.0,
	"reward": 0.1875,
	"reward_std": 0.408231720328331,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.1875,
	"rewards/format_reward_func/std": 0.408231720328331,
	"step": 32
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 381.5,
	"completions/max_terminated_length": 381.5,
	"completions/mean_length": 190.625,
	"completions/mean_terminated_length": 190.625,
	"completions/min_length": 34.5,
	"completions/min_terminated_length": 34.5,
	"epoch": 7.704038048884388e-05,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.006240386515855789,
	"kl": 0.0022764305977034383,
	"learning_rate": 3.909892574627266e-07,
	"loss": 0.0,
	"num_tokens": 88487.0,
	"reward": 0.0,
	"reward_std": 0.0,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 34
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 745.0,
	"completions/max_terminated_length": 745.0,
	"completions/mean_length": 219.75,
	"completions/mean_terminated_length": 219.75,
	"completions/min_length": 67.5,
	"completions/min_terminated_length": 67.5,
	"epoch": 8.157216757642292e-05,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 16.2641544342041,
	"kl": 0.002281889770529233,
	"learning_rate": 3.773300405821908e-07,
	"loss": 0.0,
	"num_tokens": 94251.0,
	"reward": 0.125,
	"reward_std": 0.3535533845424652,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.125,
	"rewards/format_reward_func/std": 0.3535533845424652,
	"step": 36
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 436.0,
	"completions/max_terminated_length": 436.0,
	"completions/mean_length": 172.5,
	"completions/mean_terminated_length": 172.5,
	"completions/min_length": 66.0,
	"completions/min_terminated_length": 66.0,
	"epoch": 8.610395466400197e-05,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 15.585210800170898,
	"kl": 0.010958031325571937,
	"learning_rate": 3.6313675726113475e-07,
	"loss": 0.0,
	"num_tokens": 99331.0,
	"reward": 0.125,
	"reward_std": 0.2314550280570984,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.125,
	"rewards/format_reward_func/std": 0.2314550280570984,
	"step": 38
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 644.5,
	"completions/max_terminated_length": 644.5,
	"completions/mean_length": 238.0,
	"completions/mean_terminated_length": 238.0,
	"completions/min_length": 64.0,
	"completions/min_terminated_length": 64.0,
	"epoch": 9.063574175158103e-05,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.0022101891227066517,
	"kl": 0.003324320729007013,
	"learning_rate": 3.484689390623218e-07,
	"loss": 0.0,
	"num_tokens": 105419.0,
	"reward": 0.125,
	"reward_std": 0.2314550280570984,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.125,
	"rewards/format_reward_func/std": 0.2314550280570984,
	"step": 40
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 427.0,
	"completions/max_terminated_length": 427.0,
	"completions/mean_length": 203.3125,
	"completions/mean_terminated_length": 203.3125,
	"completions/min_length": 67.0,
	"completions/min_terminated_length": 67.0,
	"epoch": 9.516752883916008e-05,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 16.7834529876709,
	"kl": 0.002571089873526944,
	"learning_rate": 3.3338810791270517e-07,
	"loss": 0.0,
	"num_tokens": 110992.0,
	"reward": 0.0625,
	"reward_std": 0.1767766922712326,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0625,
	"rewards/format_reward_func/std": 0.1767766922712326,
	"step": 42
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 267.5,
	"completions/max_terminated_length": 267.5,
	"completions/mean_length": 119.25,
	"completions/mean_terminated_length": 119.25,
	"completions/min_length": 55.5,
	"completions/min_terminated_length": 55.5,
	"epoch": 9.969931592673912e-05,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 13.29578685760498,
	"kl": 0.0029905991395935416,
	"learning_rate": 3.179575180590857e-07,
	"loss": 0.0,
	"num_tokens": 115204.0,
	"reward": 0.125,
	"reward_std": 0.3535533845424652,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.125,
	"rewards/format_reward_func/std": 0.3535533845424652,
	"step": 44
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 326.5,
	"completions/max_terminated_length": 326.5,
	"completions/mean_length": 152.5,
	"completions/mean_terminated_length": 152.5,
	"completions/min_length": 26.0,
	"completions/min_terminated_length": 26.0,
	"epoch": 0.00010423110301431818,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 11.780332565307617,
	"kl": 0.004898008599411696,
	"learning_rate": 3.022418907578188e-07,
	"loss": 0.0,
	"num_tokens": 119916.0,
	"reward": 0.0625,
	"reward_std": 0.1767766922712326,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0625,
	"rewards/format_reward_func/std": 0.1767766922712326,
	"step": 46
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 756.5,
	"completions/max_terminated_length": 756.5,
	"completions/mean_length": 237.625,
	"completions/mean_terminated_length": 237.625,
	"completions/min_length": 35.0,
	"completions/min_terminated_length": 35.0,
	"epoch": 0.00010876289010189723,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 14.306221008300781,
	"kl": 0.004835129271668848,
	"learning_rate": 2.863071428113726e-07,
	"loss": 0.0,
	"num_tokens": 125990.0,
	"reward": 0.1875,
	"reward_std": 0.408231720328331,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.1875,
	"rewards/format_reward_func/std": 0.408231720328331,
	"step": 48
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 405.5,
	"completions/max_terminated_length": 405.5,
	"completions/mean_length": 189.9375,
	"completions/mean_terminated_length": 189.9375,
	"completions/min_length": 67.0,
	"completions/min_terminated_length": 67.0,
	"epoch": 0.00011329467718947628,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 10.71300983428955,
	"kl": 0.006452581874327734,
	"learning_rate": 2.7022011009035107e-07,
	"loss": 0.0,
	"num_tokens": 131301.0,
	"reward": 0.125,
	"reward_std": 0.3535533845424652,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.125,
	"rewards/format_reward_func/std": 0.3535533845424652,
	"step": 50
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 361.0,
	"completions/max_terminated_length": 361.0,
	"completions/mean_length": 157.0625,
	"completions/mean_terminated_length": 157.0625,
	"completions/min_length": 59.0,
	"completions/min_terminated_length": 59.0,
	"epoch": 0.00011782646427705534,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 11.726592063903809,
	"kl": 0.0094611946187797,
	"learning_rate": 2.540482672006254e-07,
	"loss": 0.0,
	"num_tokens": 136102.0,
	"reward": 0.25,
	"reward_std": 0.26726123690605164,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.25,
	"rewards/format_reward_func/std": 0.26726123690605164,
	"step": 52
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 213.0,
	"completions/max_terminated_length": 213.0,
	"completions/mean_length": 117.6875,
	"completions/mean_terminated_length": 117.6875,
	"completions/min_length": 44.5,
	"completions/min_terminated_length": 44.5,
	"epoch": 0.00012235825136463439,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 13.030102729797363,
	"kl": 0.006068318209145218,
	"learning_rate": 2.37859444471388e-07,
	"loss": 0.0,
	"num_tokens": 140241.0,
	"reward": 0.0625,
	"reward_std": 0.1767766922712326,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0625,
	"rewards/format_reward_func/std": 0.1767766922712326,
	"step": 54
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 333.0,
	"completions/max_terminated_length": 333.0,
	"completions/mean_length": 136.25,
	"completions/mean_terminated_length": 136.25,
	"completions/min_length": 36.0,
	"completions/min_terminated_length": 36.0,
	"epoch": 0.00012689003845221345,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 10.891292572021484,
	"kl": 0.009872130773146637,
	"learning_rate": 2.2172154345117894e-07,
	"loss": 0.0,
	"num_tokens": 144701.0,
	"reward": 0.25,
	"reward_std": 0.4355512708425522,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.25,
	"rewards/format_reward_func/std": 0.4355513006448746,
	"step": 56
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 663.5,
	"completions/max_terminated_length": 663.5,
	"completions/mean_length": 196.625,
	"completions/mean_terminated_length": 196.625,
	"completions/min_length": 59.5,
	"completions/min_terminated_length": 59.5,
	"epoch": 0.00013142182553979248,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 11.184815406799316,
	"kl": 0.01285637664841488,
	"learning_rate": 2.0570225210519433e-07,
	"loss": 0.0,
	"num_tokens": 150159.0,
	"reward": 0.1875,
	"reward_std": 0.408231720328331,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.1875,
	"rewards/format_reward_func/std": 0.408231720328331,
	"step": 58
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 308.0,
	"completions/max_terminated_length": 308.0,
	"completions/mean_length": 140.4375,
	"completions/mean_terminated_length": 140.4375,
	"completions/min_length": 50.0,
	"completions/min_terminated_length": 50.0,
	"epoch": 0.00013595361262737154,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.02632570080459118,
	"kl": 0.014614543215429876,
	"learning_rate": 1.8986876090843664e-07,
	"loss": 0.0,
	"num_tokens": 154654.0,
	"reward": 0.1875,
	"reward_std": 0.2587745785713196,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.1875,
	"rewards/format_reward_func/std": 0.25877460837364197,
	"step": 60
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 270.0,
	"completions/max_terminated_length": 270.0,
	"completions/mean_length": 104.0625,
	"completions/mean_terminated_length": 104.0625,
	"completions/min_length": 38.0,
	"completions/min_terminated_length": 38.0,
	"epoch": 0.0001404853997149506,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 11.164044380187988,
	"kl": 0.007626835664268583,
	"learning_rate": 1.7428748102551234e-07,
	"loss": 0.0,
	"num_tokens": 158599.0,
	"reward": 0.625,
	"reward_std": 0.49871626496315,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.625,
	"rewards/format_reward_func/std": 0.49871626496315,
	"step": 62
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 160.5,
	"completions/max_terminated_length": 160.5,
	"completions/mean_length": 90.625,
	"completions/mean_terminated_length": 90.625,
	"completions/min_length": 47.0,
	"completions/min_terminated_length": 47.0,
	"epoch": 0.00014501718680252963,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 16.527629852294922,
	"kl": 0.011474673578049988,
	"learning_rate": 1.5902376575912814e-07,
	"loss": 0.0,
	"num_tokens": 162289.0,
	"reward": 0.3125,
	"reward_std": 0.44403792917728424,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.3125,
	"rewards/format_reward_func/std": 0.44403792917728424,
	"step": 64
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 398.5,
	"completions/max_terminated_length": 398.5,
	"completions/mean_length": 129.0625,
	"completions/mean_terminated_length": 129.0625,
	"completions/min_length": 47.0,
	"completions/min_terminated_length": 47.0,
	"epoch": 0.0001495489738901087,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 12.219941139221191,
	"kl": 0.009613552174414508,
	"learning_rate": 1.4414163643562753e-07,
	"loss": 0.0,
	"num_tokens": 166674.0,
	"reward": 0.25,
	"reward_std": 0.4355512708425522,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.25,
	"rewards/format_reward_func/std": 0.4355513006448746,
	"step": 66
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 244.5,
	"completions/max_terminated_length": 244.5,
	"completions/mean_length": 109.1875,
	"completions/mean_terminated_length": 109.1875,
	"completions/min_length": 45.5,
	"completions/min_terminated_length": 45.5,
	"epoch": 0.00015408076097768775,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 20.217954635620117,
	"kl": 0.020021719275973737,
	"learning_rate": 1.2970351387729872e-07,
	"loss": 0.0,
	"num_tokens": 170693.0,
	"reward": 0.375,
	"reward_std": 0.5175491571426392,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.375,
	"rewards/format_reward_func/std": 0.5175492167472839,
	"step": 68
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 329.0,
	"completions/max_terminated_length": 329.0,
	"completions/mean_length": 127.0625,
	"completions/mean_terminated_length": 127.0625,
	"completions/min_length": 45.0,
	"completions/min_terminated_length": 45.0,
	"epoch": 0.0001586125480652668,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 12.027848243713379,
	"kl": 0.01639675306796562,
	"learning_rate": 1.1576995658775404e-07,
	"loss": 0.0,
	"num_tokens": 175014.0,
	"reward": 0.4375,
	"reward_std": 0.5260358154773712,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.4375,
	"rewards/format_reward_func/std": 0.5260358452796936,
	"step": 70
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 375.0,
	"completions/max_terminated_length": 375.0,
	"completions/mean_length": 142.1875,
	"completions/mean_terminated_length": 142.1875,
	"completions/min_length": 52.0,
	"completions/min_terminated_length": 52.0,
	"epoch": 0.00016314433515284585,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 10.927833557128906,
	"kl": 0.011894080380443484,
	"learning_rate": 1.0239940674851941e-07,
	"loss": 0.0,
	"num_tokens": 179577.0,
	"reward": 0.25,
	"reward_std": 0.4629100561141968,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.25,
	"rewards/format_reward_func/std": 0.4629100561141968,
	"step": 72
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 951.0,
	"completions/max_terminated_length": 951.0,
	"completions/mean_length": 273.6875,
	"completions/mean_terminated_length": 273.6875,
	"completions/min_length": 41.5,
	"completions/min_terminated_length": 41.5,
	"epoch": 0.0001676761222404249,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 6.783898830413818,
	"kl": 0.014139190083369613,
	"learning_rate": 8.964794509221507e-08,
	"loss": 0.0,
	"num_tokens": 186236.0,
	"reward": 0.375,
	"reward_std": 0.4355512708425522,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.375,
	"rewards/format_reward_func/std": 0.4355513006448746,
	"step": 74
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 174.5,
	"completions/max_terminated_length": 174.5,
	"completions/mean_length": 106.25,
	"completions/mean_terminated_length": 106.25,
	"completions/min_length": 44.0,
	"completions/min_terminated_length": 44.0,
	"epoch": 0.00017220790932800394,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 12.524798393249512,
	"kl": 0.013552291362429969,
	"learning_rate": 7.756905568047392e-08,
	"loss": 0.0,
	"num_tokens": 190168.0,
	"reward": 0.5625,
	"reward_std": 0.5260358154773712,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.5625,
	"rewards/format_reward_func/std": 0.5260358452796936,
	"step": 76
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 647.5,
	"completions/max_terminated_length": 647.5,
	"completions/mean_length": 207.5625,
	"completions/mean_terminated_length": 207.5625,
	"completions/min_length": 46.0,
	"completions/min_terminated_length": 46.0,
	"epoch": 0.000176739696415583,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 10.13427448272705,
	"kl": 0.01421075320104137,
	"learning_rate": 6.621340157319996e-08,
	"loss": 0.0,
	"num_tokens": 195793.0,
	"reward": 0.375,
	"reward_std": 0.4355512708425522,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.375,
	"rewards/format_reward_func/std": 0.4355513006448746,
	"step": 78
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 620.0,
	"completions/max_terminated_length": 281.0,
	"completions/mean_length": 187.5625,
	"completions/mean_terminated_length": 134.85714721679688,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 0.00018127148350316206,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 16.1104793548584,
	"kl": 0.014304678879852872,
	"learning_rate": 5.5628612330087724e-08,
	"loss": 0.0,
	"num_tokens": 201050.0,
	"reward": 0.25,
	"reward_std": 0.4355512708425522,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.25,
	"rewards/format_reward_func/std": 0.4355513006448746,
	"step": 80
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 746.0,
	"completions/max_terminated_length": 345.5,
	"completions/mean_length": 174.6875,
	"completions/mean_terminated_length": 116.39286041259766,
	"completions/min_length": 50.5,
	"completions/min_terminated_length": 50.5,
	"epoch": 0.0001858032705907411,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 13.687472343444824,
	"kl": 0.012647167037357576,
	"learning_rate": 4.5859084235697235e-08,
	"loss": 0.0,
	"num_tokens": 206125.0,
	"reward": 0.25,
	"reward_std": 0.4629100561141968,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.25,
	"rewards/format_reward_func/std": 0.4629100561141968,
	"step": 82
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 240.5,
	"completions/max_terminated_length": 240.5,
	"completions/mean_length": 104.75,
	"completions/mean_terminated_length": 104.75,
	"completions/min_length": 56.0,
	"completions/min_terminated_length": 56.0,
	"epoch": 0.00019033505767832015,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 27.33205795288086,
	"kl": 0.01843659658334218,
	"learning_rate": 3.6945794086007705e-08,
	"loss": 0.0,
	"num_tokens": 210097.0,
	"reward": 0.625,
	"reward_std": 0.49871626496315,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.625,
	"rewards/format_reward_func/std": 0.49871626496315,
	"step": 84
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 516.5,
	"completions/max_terminated_length": 516.5,
	"completions/mean_length": 146.9375,
	"completions/mean_terminated_length": 146.9375,
	"completions/min_length": 45.0,
	"completions/min_terminated_length": 45.0,
	"epoch": 0.00019486684476589921,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 17.247251510620117,
	"kl": 0.014375057930010371,
	"learning_rate": 2.892612731749414e-08,
	"loss": 0.0,
	"num_tokens": 214696.0,
	"reward": 0.375,
	"reward_std": 0.5175491571426392,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.375,
	"rewards/format_reward_func/std": 0.5175492167472839,
	"step": 86
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 176.0,
	"completions/max_terminated_length": 176.0,
	"completions/mean_length": 92.375,
	"completions/mean_terminated_length": 92.375,
	"completions/min_length": 39.0,
	"completions/min_terminated_length": 39.0,
	"epoch": 0.00019939863185347825,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 12.001086235046387,
	"kl": 0.015669465501559898,
	"learning_rate": 2.183372119961499e-08,
	"loss": 0.0,
	"num_tokens": 218470.0,
	"reward": 0.1875,
	"reward_std": 0.408231720328331,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.1875,
	"rewards/format_reward_func/std": 0.408231720328331,
	"step": 88
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 522.0,
	"completions/max_terminated_length": 522.0,
	"completions/mean_length": 174.0,
	"completions/mean_terminated_length": 174.0,
	"completions/min_length": 55.5,
	"completions/min_terminated_length": 55.5,
	"epoch": 0.0002039304189410573,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 11.844170570373535,
	"kl": 0.009546459768898785,
	"learning_rate": 1.5698323748414122e-08,
	"loss": 0.0,
	"num_tokens": 223534.0,
	"reward": 0.375,
	"reward_std": 0.4355512708425522,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.375,
	"rewards/format_reward_func/std": 0.4355513006448746,
	"step": 90
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 331.5,
	"completions/max_terminated_length": 331.5,
	"completions/mean_length": 151.8125,
	"completions/mean_terminated_length": 151.8125,
	"completions/min_length": 48.5,
	"completions/min_terminated_length": 48.5,
	"epoch": 0.00020846220602863637,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 11.816337585449219,
	"kl": 0.01593888070783578,
	"learning_rate": 1.054566895300324e-08,
	"loss": 0.0,
	"num_tokens": 228243.0,
	"reward": 0.5625,
	"reward_std": 0.5260358154773712,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.5625,
	"rewards/format_reward_func/std": 0.5260358452796936,
	"step": 92
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 387.0,
	"completions/max_terminated_length": 387.0,
	"completions/mean_length": 163.6875,
	"completions/mean_terminated_length": 163.6875,
	"completions/min_length": 33.0,
	"completions/min_terminated_length": 33.0,
	"epoch": 0.0002129939931162154,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 11.730420112609863,
	"kl": 0.01412112163961865,
	"learning_rate": 6.397368838268496e-09,
	"loss": 0.0,
	"num_tokens": 233110.0,
	"reward": 0.1875,
	"reward_std": 0.408231720328331,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.1875,
	"rewards/format_reward_func/std": 0.408231720328331,
	"step": 94
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 420.0,
	"completions/max_terminated_length": 420.0,
	"completions/mean_length": 192.4375,
	"completions/mean_terminated_length": 192.4375,
	"completions/min_length": 50.0,
	"completions/min_terminated_length": 50.0,
	"epoch": 0.00021752578020379446,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 15.307268142700195,
	"kl": 0.00981484999647364,
	"learning_rate": 3.2708228165273244e-09,
	"loss": 0.0,
	"num_tokens": 238493.0,
	"reward": 0.0625,
	"reward_std": 0.1767766922712326,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.0625,
	"rewards/format_reward_func/std": 0.1767766922712326,
	"step": 96
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 372.5,
	"completions/max_terminated_length": 372.5,
	"completions/mean_length": 108.1875,
	"completions/mean_terminated_length": 108.1875,
	"completions/min_length": 37.0,
	"completions/min_terminated_length": 37.0,
	"epoch": 0.00022205756729137352,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 23.4942684173584,
	"kl": 0.022864260390633717,
	"learning_rate": 1.1791447083465133e-09,
	"loss": 0.0,
	"num_tokens": 242536.0,
	"reward": 0.25,
	"reward_std": 0.4629100561141968,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.25,
	"rewards/format_reward_func/std": 0.4629100561141968,
	"step": 98
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 426.0,
	"completions/max_terminated_length": 426.0,
	"completions/mean_length": 194.3125,
	"completions/mean_terminated_length": 194.3125,
	"completions/min_length": 37.0,
	"completions/min_terminated_length": 37.0,
	"epoch": 0.00022658935437895256,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 23.61033058166504,
	"kl": 0.0231838297622744,
	"learning_rate": 1.3110773862126667e-10,
	"loss": 0.0,
	"num_tokens": 247917.0,
	"reward": 0.4375,
	"reward_std": 0.5260358154773712,
	"rewards/equation_reward_func/mean": 0.0,
	"rewards/equation_reward_func/std": 0.0,
	"rewards/format_reward_func/mean": 0.4375,
	"rewards/format_reward_func/std": 0.5260358452796936,
	"step": 100
	},
	{
	"epoch": 0.00022658935437895256,
	"step": 100,
	"total_flos": 0.0,
	"train_loss": 8.04626297735922e-06,
	"train_runtime": 2640.6632,
	"train_samples_per_second": 0.303,
	"train_steps_per_second": 0.038
	}
	],
	"logging_steps": 2,
	"max_steps": 100,
	"num_input_tokens_seen": 247917,
	"num_train_epochs": 1,
	"save_steps": 25,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 1,
	"trial_name": null,
	"trial_params": null
	}