{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.00022658935437895256,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 656.0,
      "completions/max_terminated_length": 222.5,
      "completions/mean_length": 194.0625,
      "completions/mean_terminated_length": 136.94643020629883,
      "completions/min_length": 61.5,
      "completions/min_terminated_length": 61.5,
      "epoch": 4.531787087579051e-06,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 12.525100708007812,
      "kl": 0.0,
      "learning_rate": 1.6666666666666665e-07,
      "loss": -0.0,
      "num_tokens": 5377.0,
      "reward": 0.0625,
      "reward_std": 0.1767766922712326,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 618.0,
      "completions/max_terminated_length": 198.0,
      "completions/mean_length": 171.25,
      "completions/mean_terminated_length": 114.93750381469727,
      "completions/min_length": 57.5,
      "completions/min_terminated_length": 57.5,
      "epoch": 9.063574175158102e-06,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.001167318900115788,
      "kl": 0.0009191570134134963,
      "learning_rate": 5e-07,
      "loss": 0.0,
      "num_tokens": 10397.0,
      "reward": 0.0625,
      "reward_std": 0.1767766922712326,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 477.0,
      "completions/max_terminated_length": 477.0,
      "completions/mean_length": 175.5,
      "completions/mean_terminated_length": 175.5,
      "completions/min_length": 38.5,
      "completions/min_terminated_length": 38.5,
      "epoch": 1.3595361262737154e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0017322486964985728,
      "kl": 0.0009386140663991682,
      "learning_rate": 4.994757065594279e-07,
      "loss": 0.0,
      "num_tokens": 15493.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 307.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 156.25,
      "completions/mean_terminated_length": 156.25,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 1.8127148350316204e-05,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 11.28031063079834,
      "kl": 0.0007646345866305637,
      "learning_rate": 4.979050253066063e-07,
      "loss": 0.0,
      "num_tokens": 20313.0,
      "reward": 0.0625,
      "reward_std": 0.1767766922712326,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 312.5,
      "completions/max_terminated_length": 312.5,
      "completions/mean_length": 152.25,
      "completions/mean_terminated_length": 152.25,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "epoch": 2.2658935437895258e-05,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 9.902769088745117,
      "kl": 0.0008557607743568951,
      "learning_rate": 4.952945442245597e-07,
      "loss": 0.0,
      "num_tokens": 25061.0,
      "reward": 0.0625,
      "reward_std": 0.1767766922712326,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 543.5,
      "completions/max_terminated_length": 543.5,
      "completions/mean_length": 164.4375,
      "completions/mean_terminated_length": 164.4375,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 2.7190722525474308e-05,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0027156081050634384,
      "kl": 0.0010605865281831939,
      "learning_rate": 4.916552125781528e-07,
      "loss": 0.0,
      "num_tokens": 29980.0,
      "reward": 0.0625,
      "reward_std": 0.1767766922712326,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 451.0,
      "completions/max_terminated_length": 451.0,
      "completions/mean_length": 211.0,
      "completions/mean_terminated_length": 211.0,
      "completions/min_length": 70.5,
      "completions/min_terminated_length": 70.5,
      "epoch": 3.172250961305336e-05,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 12.591176986694336,
      "kl": 0.0009377936348755611,
      "learning_rate": 4.870022949890676e-07,
      "loss": 0.0,
      "num_tokens": 35676.0,
      "reward": 0.0625,
      "reward_std": 0.1767766922712326,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 465.5,
      "completions/max_terminated_length": 465.5,
      "completions/mean_length": 163.375,
      "completions/mean_terminated_length": 163.375,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 3.625429670063241e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.893583297729492,
      "kl": 0.001199566273498931,
      "learning_rate": 4.81355307410676e-07,
      "loss": 0.0,
      "num_tokens": 40570.0,
      "reward": 0.125,
      "reward_std": 0.3535533845424652,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.3535533845424652,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 209.75,
      "completions/mean_terminated_length": 209.75,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 4.078608378821146e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009183982037939131,
      "kl": 0.0008469254862575326,
      "learning_rate": 4.747379352713488e-07,
      "loss": 0.0,
      "num_tokens": 46174.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 583.0,
      "completions/max_terminated_length": 583.0,
      "completions/mean_length": 211.125,
      "completions/mean_terminated_length": 211.125,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 4.5317870875790515e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0012323512928560376,
      "kl": 0.0012486951891332865,
      "learning_rate": 4.6717793412953776e-07,
      "loss": 0.0,
      "num_tokens": 51832.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 620.5,
      "completions/max_terminated_length": 285.5,
      "completions/mean_length": 202.9375,
      "completions/mean_terminated_length": 150.90178680419922,
      "completions/min_length": 60.5,
      "completions/min_terminated_length": 60.5,
      "epoch": 4.984965796336956e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016945754177868366,
      "kl": 0.00123220352907083,
      "learning_rate": 4.5870701325731773e-07,
      "loss": 0.0,
      "num_tokens": 57327.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 683.0,
      "completions/max_terminated_length": 434.5,
      "completions/mean_length": 257.0,
      "completions/mean_terminated_length": 209.44644165039062,
      "completions/min_length": 84.5,
      "completions/min_terminated_length": 84.5,
      "epoch": 5.4381445050948616e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00123355642426759,
      "kl": 0.0011914248134416994,
      "learning_rate": 4.4936070264068016e-07,
      "loss": 0.0,
      "num_tokens": 63719.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 641.0,
      "completions/max_terminated_length": 235.5,
      "completions/mean_length": 181.3125,
      "completions/mean_terminated_length": 125.05357360839844,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 5.891323213852767e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.299399375915527,
      "kl": 0.0018970294222526718,
      "learning_rate": 4.391782039544238e-07,
      "loss": 0.0,
      "num_tokens": 68924.0,
      "reward": 0.125,
      "reward_std": 0.3535533845424652,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.3535533845424652,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 214.5,
      "completions/max_terminated_length": 214.5,
      "completions/mean_length": 108.6875,
      "completions/mean_terminated_length": 108.6875,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 6.344501922610672e-05,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0035000136122107506,
      "kl": 0.0019113743601337774,
      "learning_rate": 4.282022261367073e-07,
      "loss": 0.0,
      "num_tokens": 72919.0,
      "reward": 0.0625,
      "reward_std": 0.1767766922712326,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 583.5,
      "completions/max_terminated_length": 583.5,
      "completions/mean_length": 217.625,
      "completions/mean_terminated_length": 217.625,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 6.797680631368577e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021501195151358843,
      "kl": 0.0015104188605619129,
      "learning_rate": 4.1647880625292027e-07,
      "loss": 0.0,
      "num_tokens": 78713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 335.5,
      "completions/max_terminated_length": 335.5,
      "completions/mean_length": 134.75,
      "completions/mean_terminated_length": 134.75,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 7.250859340126482e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.151634216308594,
      "kl": 0.0026571638591121882,
      "learning_rate": 4.040571164002318e-07,
      "loss": 0.0,
      "num_tokens": 83149.0,
      "reward": 0.1875,
      "reward_std": 0.408231720328331,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.408231720328331,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 381.5,
      "completions/max_terminated_length": 381.5,
      "completions/mean_length": 190.625,
      "completions/mean_terminated_length": 190.625,
      "completions/min_length": 34.5,
      "completions/min_terminated_length": 34.5,
      "epoch": 7.704038048884388e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006240386515855789,
      "kl": 0.0022764305977034383,
      "learning_rate": 3.909892574627266e-07,
      "loss": 0.0,
      "num_tokens": 88487.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 745.0,
      "completions/max_terminated_length": 745.0,
      "completions/mean_length": 219.75,
      "completions/mean_terminated_length": 219.75,
      "completions/min_length": 67.5,
      "completions/min_terminated_length": 67.5,
      "epoch": 8.157216757642292e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 16.2641544342041,
      "kl": 0.002281889770529233,
      "learning_rate": 3.773300405821908e-07,
      "loss": 0.0,
      "num_tokens": 94251.0,
      "reward": 0.125,
      "reward_std": 0.3535533845424652,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.3535533845424652,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 436.0,
      "completions/max_terminated_length": 436.0,
      "completions/mean_length": 172.5,
      "completions/mean_terminated_length": 172.5,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 8.610395466400197e-05,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 15.585210800170898,
      "kl": 0.010958031325571937,
      "learning_rate": 3.6313675726113475e-07,
      "loss": 0.0,
      "num_tokens": 99331.0,
      "reward": 0.125,
      "reward_std": 0.2314550280570984,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.2314550280570984,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 644.5,
      "completions/max_terminated_length": 644.5,
      "completions/mean_length": 238.0,
      "completions/mean_terminated_length": 238.0,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 9.063574175158103e-05,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0022101891227066517,
      "kl": 0.003324320729007013,
      "learning_rate": 3.484689390623218e-07,
      "loss": 0.0,
      "num_tokens": 105419.0,
      "reward": 0.125,
      "reward_std": 0.2314550280570984,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.2314550280570984,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 427.0,
      "completions/max_terminated_length": 427.0,
      "completions/mean_length": 203.3125,
      "completions/mean_terminated_length": 203.3125,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "epoch": 9.516752883916008e-05,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 16.7834529876709,
      "kl": 0.002571089873526944,
      "learning_rate": 3.3338810791270517e-07,
      "loss": 0.0,
      "num_tokens": 110992.0,
      "reward": 0.0625,
      "reward_std": 0.1767766922712326,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 267.5,
      "completions/max_terminated_length": 267.5,
      "completions/mean_length": 119.25,
      "completions/mean_terminated_length": 119.25,
      "completions/min_length": 55.5,
      "completions/min_terminated_length": 55.5,
      "epoch": 9.969931592673912e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.29578685760498,
      "kl": 0.0029905991395935416,
      "learning_rate": 3.179575180590857e-07,
      "loss": 0.0,
      "num_tokens": 115204.0,
      "reward": 0.125,
      "reward_std": 0.3535533845424652,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.3535533845424652,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 326.5,
      "completions/max_terminated_length": 326.5,
      "completions/mean_length": 152.5,
      "completions/mean_terminated_length": 152.5,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.00010423110301431818,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 11.780332565307617,
      "kl": 0.004898008599411696,
      "learning_rate": 3.022418907578188e-07,
      "loss": 0.0,
      "num_tokens": 119916.0,
      "reward": 0.0625,
      "reward_std": 0.1767766922712326,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 756.5,
      "completions/max_terminated_length": 756.5,
      "completions/mean_length": 237.625,
      "completions/mean_terminated_length": 237.625,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.00010876289010189723,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.306221008300781,
      "kl": 0.004835129271668848,
      "learning_rate": 2.863071428113726e-07,
      "loss": 0.0,
      "num_tokens": 125990.0,
      "reward": 0.1875,
      "reward_std": 0.408231720328331,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.408231720328331,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 405.5,
      "completions/max_terminated_length": 405.5,
      "completions/mean_length": 189.9375,
      "completions/mean_terminated_length": 189.9375,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "epoch": 0.00011329467718947628,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.71300983428955,
      "kl": 0.006452581874327734,
      "learning_rate": 2.7022011009035107e-07,
      "loss": 0.0,
      "num_tokens": 131301.0,
      "reward": 0.125,
      "reward_std": 0.3535533845424652,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.3535533845424652,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 361.0,
      "completions/max_terminated_length": 361.0,
      "completions/mean_length": 157.0625,
      "completions/mean_terminated_length": 157.0625,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.00011782646427705534,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 11.726592063903809,
      "kl": 0.0094611946187797,
      "learning_rate": 2.540482672006254e-07,
      "loss": 0.0,
      "num_tokens": 136102.0,
      "reward": 0.25,
      "reward_std": 0.26726123690605164,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.26726123690605164,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 213.0,
      "completions/max_terminated_length": 213.0,
      "completions/mean_length": 117.6875,
      "completions/mean_terminated_length": 117.6875,
      "completions/min_length": 44.5,
      "completions/min_terminated_length": 44.5,
      "epoch": 0.00012235825136463439,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 13.030102729797363,
      "kl": 0.006068318209145218,
      "learning_rate": 2.37859444471388e-07,
      "loss": 0.0,
      "num_tokens": 140241.0,
      "reward": 0.0625,
      "reward_std": 0.1767766922712326,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 333.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 136.25,
      "completions/mean_terminated_length": 136.25,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.00012689003845221345,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.891292572021484,
      "kl": 0.009872130773146637,
      "learning_rate": 2.2172154345117894e-07,
      "loss": 0.0,
      "num_tokens": 144701.0,
      "reward": 0.25,
      "reward_std": 0.4355512708425522,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4355513006448746,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 663.5,
      "completions/max_terminated_length": 663.5,
      "completions/mean_length": 196.625,
      "completions/mean_terminated_length": 196.625,
      "completions/min_length": 59.5,
      "completions/min_terminated_length": 59.5,
      "epoch": 0.00013142182553979248,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.184815406799316,
      "kl": 0.01285637664841488,
      "learning_rate": 2.0570225210519433e-07,
      "loss": 0.0,
      "num_tokens": 150159.0,
      "reward": 0.1875,
      "reward_std": 0.408231720328331,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.408231720328331,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 308.0,
      "completions/max_terminated_length": 308.0,
      "completions/mean_length": 140.4375,
      "completions/mean_terminated_length": 140.4375,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.00013595361262737154,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.02632570080459118,
      "kl": 0.014614543215429876,
      "learning_rate": 1.8986876090843664e-07,
      "loss": 0.0,
      "num_tokens": 154654.0,
      "reward": 0.1875,
      "reward_std": 0.2587745785713196,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.25877460837364197,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 270.0,
      "completions/max_terminated_length": 270.0,
      "completions/mean_length": 104.0625,
      "completions/mean_terminated_length": 104.0625,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.0001404853997149506,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.164044380187988,
      "kl": 0.007626835664268583,
      "learning_rate": 1.7428748102551234e-07,
      "loss": 0.0,
      "num_tokens": 158599.0,
      "reward": 0.625,
      "reward_std": 0.49871626496315,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49871626496315,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 160.5,
      "completions/max_terminated_length": 160.5,
      "completions/mean_length": 90.625,
      "completions/mean_terminated_length": 90.625,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.00014501718680252963,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 16.527629852294922,
      "kl": 0.011474673578049988,
      "learning_rate": 1.5902376575912814e-07,
      "loss": 0.0,
      "num_tokens": 162289.0,
      "reward": 0.3125,
      "reward_std": 0.44403792917728424,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.44403792917728424,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 398.5,
      "completions/max_terminated_length": 398.5,
      "completions/mean_length": 129.0625,
      "completions/mean_terminated_length": 129.0625,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.0001495489738901087,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.219941139221191,
      "kl": 0.009613552174414508,
      "learning_rate": 1.4414163643562753e-07,
      "loss": 0.0,
      "num_tokens": 166674.0,
      "reward": 0.25,
      "reward_std": 0.4355512708425522,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4355513006448746,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.5,
      "completions/max_terminated_length": 244.5,
      "completions/mean_length": 109.1875,
      "completions/mean_terminated_length": 109.1875,
      "completions/min_length": 45.5,
      "completions/min_terminated_length": 45.5,
      "epoch": 0.00015408076097768775,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 20.217954635620117,
      "kl": 0.020021719275973737,
      "learning_rate": 1.2970351387729872e-07,
      "loss": 0.0,
      "num_tokens": 170693.0,
      "reward": 0.375,
      "reward_std": 0.5175491571426392,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.5175492167472839,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 329.0,
      "completions/max_terminated_length": 329.0,
      "completions/mean_length": 127.0625,
      "completions/mean_terminated_length": 127.0625,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.0001586125480652668,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.027848243713379,
      "kl": 0.01639675306796562,
      "learning_rate": 1.1576995658775404e-07,
      "loss": 0.0,
      "num_tokens": 175014.0,
      "reward": 0.4375,
      "reward_std": 0.5260358154773712,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.5260358452796936,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 375.0,
      "completions/max_terminated_length": 375.0,
      "completions/mean_length": 142.1875,
      "completions/mean_terminated_length": 142.1875,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.00016314433515284585,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.927833557128906,
      "kl": 0.011894080380443484,
      "learning_rate": 1.0239940674851941e-07,
      "loss": 0.0,
      "num_tokens": 179577.0,
      "reward": 0.25,
      "reward_std": 0.4629100561141968,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4629100561141968,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 951.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 273.6875,
      "completions/mean_terminated_length": 273.6875,
      "completions/min_length": 41.5,
      "completions/min_terminated_length": 41.5,
      "epoch": 0.0001676761222404249,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.783898830413818,
      "kl": 0.014139190083369613,
      "learning_rate": 8.964794509221507e-08,
      "loss": 0.0,
      "num_tokens": 186236.0,
      "reward": 0.375,
      "reward_std": 0.4355512708425522,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.4355513006448746,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 174.5,
      "completions/max_terminated_length": 174.5,
      "completions/mean_length": 106.25,
      "completions/mean_terminated_length": 106.25,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "epoch": 0.00017220790932800394,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.524798393249512,
      "kl": 0.013552291362429969,
      "learning_rate": 7.756905568047392e-08,
      "loss": 0.0,
      "num_tokens": 190168.0,
      "reward": 0.5625,
      "reward_std": 0.5260358154773712,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.5260358452796936,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 647.5,
      "completions/max_terminated_length": 647.5,
      "completions/mean_length": 207.5625,
      "completions/mean_terminated_length": 207.5625,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.000176739696415583,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.13427448272705,
      "kl": 0.01421075320104137,
      "learning_rate": 6.621340157319996e-08,
      "loss": 0.0,
      "num_tokens": 195793.0,
      "reward": 0.375,
      "reward_std": 0.4355512708425522,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.4355513006448746,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 620.0,
      "completions/max_terminated_length": 281.0,
      "completions/mean_length": 187.5625,
      "completions/mean_terminated_length": 134.85714721679688,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.00018127148350316206,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 16.1104793548584,
      "kl": 0.014304678879852872,
      "learning_rate": 5.5628612330087724e-08,
      "loss": 0.0,
      "num_tokens": 201050.0,
      "reward": 0.25,
      "reward_std": 0.4355512708425522,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4355513006448746,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 746.0,
      "completions/max_terminated_length": 345.5,
      "completions/mean_length": 174.6875,
      "completions/mean_terminated_length": 116.39286041259766,
      "completions/min_length": 50.5,
      "completions/min_terminated_length": 50.5,
      "epoch": 0.0001858032705907411,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.687472343444824,
      "kl": 0.012647167037357576,
      "learning_rate": 4.5859084235697235e-08,
      "loss": 0.0,
      "num_tokens": 206125.0,
      "reward": 0.25,
      "reward_std": 0.4629100561141968,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4629100561141968,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 240.5,
      "completions/max_terminated_length": 240.5,
      "completions/mean_length": 104.75,
      "completions/mean_terminated_length": 104.75,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.00019033505767832015,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 27.33205795288086,
      "kl": 0.01843659658334218,
      "learning_rate": 3.6945794086007705e-08,
      "loss": 0.0,
      "num_tokens": 210097.0,
      "reward": 0.625,
      "reward_std": 0.49871626496315,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49871626496315,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 516.5,
      "completions/max_terminated_length": 516.5,
      "completions/mean_length": 146.9375,
      "completions/mean_terminated_length": 146.9375,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.00019486684476589921,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 17.247251510620117,
      "kl": 0.014375057930010371,
      "learning_rate": 2.892612731749414e-08,
      "loss": 0.0,
      "num_tokens": 214696.0,
      "reward": 0.375,
      "reward_std": 0.5175491571426392,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.5175492167472839,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 176.0,
      "completions/max_terminated_length": 176.0,
      "completions/mean_length": 92.375,
      "completions/mean_terminated_length": 92.375,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.00019939863185347825,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.001086235046387,
      "kl": 0.015669465501559898,
      "learning_rate": 2.183372119961499e-08,
      "loss": 0.0,
      "num_tokens": 218470.0,
      "reward": 0.1875,
      "reward_std": 0.408231720328331,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.408231720328331,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 522.0,
      "completions/max_terminated_length": 522.0,
      "completions/mean_length": 174.0,
      "completions/mean_terminated_length": 174.0,
      "completions/min_length": 55.5,
      "completions/min_terminated_length": 55.5,
      "epoch": 0.0002039304189410573,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.844170570373535,
      "kl": 0.009546459768898785,
      "learning_rate": 1.5698323748414122e-08,
      "loss": 0.0,
      "num_tokens": 223534.0,
      "reward": 0.375,
      "reward_std": 0.4355512708425522,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.4355513006448746,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 331.5,
      "completions/max_terminated_length": 331.5,
      "completions/mean_length": 151.8125,
      "completions/mean_terminated_length": 151.8125,
      "completions/min_length": 48.5,
      "completions/min_terminated_length": 48.5,
      "epoch": 0.00020846220602863637,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.816337585449219,
      "kl": 0.01593888070783578,
      "learning_rate": 1.054566895300324e-08,
      "loss": 0.0,
      "num_tokens": 228243.0,
      "reward": 0.5625,
      "reward_std": 0.5260358154773712,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.5260358452796936,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 387.0,
      "completions/max_terminated_length": 387.0,
      "completions/mean_length": 163.6875,
      "completions/mean_terminated_length": 163.6875,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.0002129939931162154,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.730420112609863,
      "kl": 0.01412112163961865,
      "learning_rate": 6.397368838268496e-09,
      "loss": 0.0,
      "num_tokens": 233110.0,
      "reward": 0.1875,
      "reward_std": 0.408231720328331,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.408231720328331,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 420.0,
      "completions/max_terminated_length": 420.0,
      "completions/mean_length": 192.4375,
      "completions/mean_terminated_length": 192.4375,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.00021752578020379446,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 15.307268142700195,
      "kl": 0.00981484999647364,
      "learning_rate": 3.2708228165273244e-09,
      "loss": 0.0,
      "num_tokens": 238493.0,
      "reward": 0.0625,
      "reward_std": 0.1767766922712326,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 372.5,
      "completions/max_terminated_length": 372.5,
      "completions/mean_length": 108.1875,
      "completions/mean_terminated_length": 108.1875,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.00022205756729137352,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 23.4942684173584,
      "kl": 0.022864260390633717,
      "learning_rate": 1.1791447083465133e-09,
      "loss": 0.0,
      "num_tokens": 242536.0,
      "reward": 0.25,
      "reward_std": 0.4629100561141968,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4629100561141968,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 426.0,
      "completions/max_terminated_length": 426.0,
      "completions/mean_length": 194.3125,
      "completions/mean_terminated_length": 194.3125,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.00022658935437895256,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 23.61033058166504,
      "kl": 0.0231838297622744,
      "learning_rate": 1.3110773862126667e-10,
      "loss": 0.0,
      "num_tokens": 247917.0,
      "reward": 0.4375,
      "reward_std": 0.5260358154773712,
      "rewards/equation_reward_func/mean": 0.0,
      "rewards/equation_reward_func/std": 0.0,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.5260358452796936,
      "step": 100
    },
    {
      "epoch": 0.00022658935437895256,
      "step": 100,
      "total_flos": 0.0,
      "train_loss": 8.04626297735922e-06,
      "train_runtime": 2640.6632,
      "train_samples_per_second": 0.303,
      "train_steps_per_second": 0.038
    }
  ],
  "logging_steps": 2,
  "max_steps": 100,
  "num_input_tokens_seen": 247917,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}