{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.3865546218487395,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 635.7109375,
      "epoch": 0.01680672268907563,
      "grad_norm": 0.31708475947380066,
      "kl": 0.0,
      "learning_rate": 1e-06,
      "loss": 0.0099,
      "num_tokens": 143267.0,
      "reward": 0.039062500349245965,
      "reward_std": 0.0698821279220283,
      "rewards/curriculum_aware_reward_fn": 0.023437500349245965,
      "rewards/format_reward": 0.015625,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 527.6328125,
      "epoch": 0.03361344537815126,
      "grad_norm": 0.43825313448905945,
      "kl": 0.0002913475036621094,
      "learning_rate": 1e-06,
      "loss": 0.0432,
      "num_tokens": 270812.0,
      "reward": 0.09292763145640492,
      "reward_std": 0.12866380205377936,
      "rewards/curriculum_aware_reward_fn": 0.06949013145640492,
      "rewards/format_reward": 0.0234375,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 608.9921875,
      "epoch": 0.05042016806722689,
      "grad_norm": 0.4227641224861145,
      "kl": 0.0002665519714355469,
      "learning_rate": 1e-06,
      "loss": -0.0273,
      "num_tokens": 410971.0,
      "reward": 0.059621710795909166,
      "reward_std": 0.07889116508886218,
      "rewards/curriculum_aware_reward_fn": 0.059621710795909166,
      "rewards/format_reward": 0.0,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 558.921875,
      "epoch": 0.06722689075630252,
      "grad_norm": 0.4796917140483856,
      "kl": 0.0002789497375488281,
      "learning_rate": 1e-06,
      "loss": -0.0009,
      "num_tokens": 542313.0,
      "reward": 0.08552631549537182,
      "reward_std": 0.12651031091809273,
      "rewards/curriculum_aware_reward_fn": 0.06990131689235568,
      "rewards/format_reward": 0.015625,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 590.265625,
      "epoch": 0.08403361344537816,
      "grad_norm": 0.5620821118354797,
      "kl": 0.0003027915954589844,
      "learning_rate": 1e-06,
      "loss": 0.0288,
      "num_tokens": 677075.0,
      "reward": 0.14925987273454666,
      "reward_std": 0.24606542102992535,
      "rewards/curriculum_aware_reward_fn": 0.09457236900925636,
      "rewards/format_reward": 0.0546875,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 592.5234375,
      "epoch": 0.10084033613445378,
      "grad_norm": 0.4298699200153351,
      "kl": 0.0003509521484375,
      "learning_rate": 1e-06,
      "loss": -0.0151,
      "num_tokens": 812710.0,
      "reward": 0.08840460644569248,
      "reward_std": 0.1141207623295486,
      "rewards/curriculum_aware_reward_fn": 0.03371710644569248,
      "rewards/format_reward": 0.0546875,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 582.046875,
      "epoch": 0.11764705882352941,
      "grad_norm": 0.526942253112793,
      "kl": 0.0004343986511230469,
      "learning_rate": 1e-06,
      "loss": 0.0192,
      "num_tokens": 943268.0,
      "reward": 0.12088815867900848,
      "reward_std": 0.17540471255779266,
      "rewards/curriculum_aware_reward_fn": 0.07401315867900848,
      "rewards/format_reward": 0.046875,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 534.75,
      "epoch": 0.13445378151260504,
      "grad_norm": 0.44275274872779846,
      "kl": 0.0003724098205566406,
      "learning_rate": 1e-06,
      "loss": -0.0033,
      "num_tokens": 1074300.0,
      "reward": 0.030016446253284812,
      "reward_std": 0.08489933330565691,
      "rewards/curriculum_aware_reward_fn": 0.014391447650268674,
      "rewards/format_reward": 0.015625,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 539.09375,
      "epoch": 0.15126050420168066,
      "grad_norm": 0.5494865775108337,
      "kl": 0.0007390975952148438,
      "learning_rate": 1e-06,
      "loss": 0.0036,
      "num_tokens": 1197896.0,
      "reward": 0.16570723708719015,
      "reward_std": 0.21696669608354568,
      "rewards/curriculum_aware_reward_fn": 0.05633223685435951,
      "rewards/format_reward": 0.109375,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 593.7734375,
      "epoch": 0.16806722689075632,
      "grad_norm": 0.5171737670898438,
      "kl": 0.0006322860717773438,
      "learning_rate": 1e-06,
      "loss": 0.0193,
      "num_tokens": 1336931.0,
      "reward": 0.11143092066049576,
      "reward_std": 0.19064411148428917,
      "rewards/curriculum_aware_reward_fn": 0.017680921009741724,
      "rewards/format_reward": 0.09375,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 578.4765625,
      "epoch": 0.18487394957983194,
      "grad_norm": 0.6088258028030396,
      "kl": 0.001346588134765625,
      "learning_rate": 1e-06,
      "loss": 0.037,
      "num_tokens": 1467592.0,
      "reward": 0.22944078594446182,
      "reward_std": 0.3224767856299877,
      "rewards/curriculum_aware_reward_fn": 0.04194079013541341,
      "rewards/format_reward": 0.1875,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 601.171875,
      "epoch": 0.20168067226890757,
      "grad_norm": 0.4451327621936798,
      "kl": 0.0010366439819335938,
      "learning_rate": 1e-06,
      "loss": 0.0148,
      "num_tokens": 1607894.0,
      "reward": 0.1204769799951464,
      "reward_std": 0.1381341191008687,
      "rewards/curriculum_aware_reward_fn": 0.018914473825134337,
      "rewards/format_reward": 0.1015625,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 526.28125,
      "epoch": 0.2184873949579832,
      "grad_norm": 0.636314332485199,
      "kl": 0.00191497802734375,
      "learning_rate": 1e-06,
      "loss": 0.0125,
      "num_tokens": 1735650.0,
      "reward": 0.26644736528396606,
      "reward_std": 0.30141641572117805,
      "rewards/curriculum_aware_reward_fn": 0.03988486935850233,
      "rewards/format_reward": 0.2265625,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 507.515625,
      "epoch": 0.23529411764705882,
      "grad_norm": 0.6864922642707825,
      "kl": 0.004413604736328125,
      "learning_rate": 1e-06,
      "loss": 0.0802,
      "num_tokens": 1856316.0,
      "reward": 0.3112664446234703,
      "reward_std": 0.31644799932837486,
      "rewards/curriculum_aware_reward_fn": 0.05345394683536142,
      "rewards/format_reward": 0.2578125,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 554.5859375,
      "epoch": 0.25210084033613445,
      "grad_norm": 0.6268811225891113,
      "kl": 0.0036067962646484375,
      "learning_rate": 1e-06,
      "loss": -0.0044,
      "num_tokens": 1987511.0,
      "reward": 0.4337993338704109,
      "reward_std": 0.32329631969332695,
      "rewards/curriculum_aware_reward_fn": 0.050986841320991516,
      "rewards/format_reward": 0.3828125,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 584.3828125,
      "epoch": 0.2689075630252101,
      "grad_norm": 0.5531853437423706,
      "kl": 0.003597259521484375,
      "learning_rate": 1e-06,
      "loss": 0.0104,
      "num_tokens": 2119768.0,
      "reward": 0.3828125037252903,
      "reward_std": 0.26145630702376366,
      "rewards/curriculum_aware_reward_fn": 0.0546875,
      "rewards/format_reward": 0.328125,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 481.8046875,
      "epoch": 0.2857142857142857,
      "grad_norm": 0.6449251174926758,
      "kl": 0.005481719970703125,
      "learning_rate": 1e-06,
      "loss": -0.0094,
      "num_tokens": 2238911.0,
      "reward": 0.4543585404753685,
      "reward_std": 0.26075971499085426,
      "rewards/curriculum_aware_reward_fn": 0.05592105304822326,
      "rewards/format_reward": 0.3984375,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 645.75,
      "epoch": 0.3025210084033613,
      "grad_norm": 0.37918156385421753,
      "kl": 0.001049041748046875,
      "learning_rate": 1e-06,
      "loss": 0.0055,
      "num_tokens": 2385767.0,
      "reward": 0.1451480264076963,
      "reward_std": 0.1290158643387258,
      "rewards/curriculum_aware_reward_fn": 0.04358552640769631,
      "rewards/format_reward": 0.1015625,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 617.6953125,
      "epoch": 0.31932773109243695,
      "grad_norm": 0.39814478158950806,
      "kl": 0.00528717041015625,
      "learning_rate": 1e-06,
      "loss": 0.0518,
      "num_tokens": 2525656.0,
      "reward": 0.35115131735801697,
      "reward_std": 0.11648409254848957,
      "rewards/curriculum_aware_reward_fn": 0.02302631549537182,
      "rewards/format_reward": 0.328125,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 459.6015625,
      "epoch": 0.33613445378151263,
      "grad_norm": 0.7307525873184204,
      "kl": 0.005184173583984375,
      "learning_rate": 1e-06,
      "loss": 0.083,
      "num_tokens": 2644077.0,
      "reward": 0.47574012726545334,
      "reward_std": 0.2815094441175461,
      "rewards/curriculum_aware_reward_fn": 0.04605263099074364,
      "rewards/format_reward": 0.4296875,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 620.46875,
      "epoch": 0.35294117647058826,
      "grad_norm": 0.46509799361228943,
      "kl": 0.0036363601684570312,
      "learning_rate": 1e-06,
      "loss": 0.0145,
      "num_tokens": 2786169.0,
      "reward": 0.24177631677594036,
      "reward_std": 0.09853590792044997,
      "rewards/curriculum_aware_reward_fn": 0.023026315728202462,
      "rewards/format_reward": 0.21875,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 578.9609375,
      "epoch": 0.3697478991596639,
      "grad_norm": 0.5765166878700256,
      "kl": 0.005565643310546875,
      "learning_rate": 1e-06,
      "loss": 0.0042,
      "num_tokens": 2917180.0,
      "reward": 0.4958881437778473,
      "reward_std": 0.10692231869325042,
      "rewards/curriculum_aware_reward_fn": 0.0740131582133472,
      "rewards/format_reward": 0.421875,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 579.578125,
      "epoch": 0.3865546218487395,
      "grad_norm": 0.5340356826782227,
      "kl": 0.00540924072265625,
      "learning_rate": 1e-06,
      "loss": -0.0083,
      "num_tokens": 3053414.0,
      "reward": 0.3708881437778473,
      "reward_std": 0.11791826784610748,
      "rewards/curriculum_aware_reward_fn": 0.06620065867900848,
      "rewards/format_reward": 0.3046875,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 507.5625,
      "epoch": 0.40336134453781514,
      "grad_norm": 0.4752294719219208,
      "kl": 0.031703948974609375,
      "learning_rate": 1e-06,
      "loss": -0.0004,
      "num_tokens": 3181894.0,
      "reward": 0.3700657896697521,
      "reward_std": 0.1367718242108822,
      "rewards/curriculum_aware_reward_fn": 0.002878289553336799,
      "rewards/format_reward": 0.3671875,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 496.4296875,
      "epoch": 0.42016806722689076,
      "grad_norm": 0.46164318919181824,
      "kl": 0.0082855224609375,
      "learning_rate": 1e-06,
      "loss": -0.0091,
      "num_tokens": 3304077.0,
      "reward": 0.5016447380185127,
      "reward_std": 0.09064025245606899,
      "rewards/curriculum_aware_reward_fn": 0.017269736621528864,
      "rewards/format_reward": 0.484375,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 454.5859375,
      "epoch": 0.4369747899159664,
      "grad_norm": 0.5706049799919128,
      "kl": 0.01887798309326172,
      "learning_rate": 1e-06,
      "loss": -0.0096,
      "num_tokens": 3420488.0,
      "reward": 0.6875,
      "reward_std": 0.12697386741638184,
      "rewards/curriculum_aware_reward_fn": 0.0234375,
      "rewards/format_reward": 0.6640625,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 554.46875,
      "epoch": 0.453781512605042,
      "grad_norm": 0.45473384857177734,
      "kl": 0.0068416595458984375,
      "learning_rate": 1e-06,
      "loss": 0.0074,
      "num_tokens": 3552340.0,
      "reward": 0.34868420753628016,
      "reward_std": 0.10102300066500902,
      "rewards/curriculum_aware_reward_fn": 0.012746710679493845,
      "rewards/format_reward": 0.3359375,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 534.0,
      "epoch": 0.47058823529411764,
      "grad_norm": 0.348452091217041,
      "kl": 0.01036834716796875,
      "learning_rate": 1e-06,
      "loss": 0.0145,
      "num_tokens": 3677892.0,
      "reward": 0.5571546033024788,
      "reward_std": 0.055680982768535614,
      "rewards/curriculum_aware_reward_fn": 0.010279605048708618,
      "rewards/format_reward": 0.546875,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 584.03125,
      "epoch": 0.48739495798319327,
      "grad_norm": 0.452033668756485,
      "kl": 0.0071258544921875,
      "learning_rate": 1e-06,
      "loss": 0.0115,
      "num_tokens": 3813600.0,
      "reward": 0.3984375,
      "reward_std": 0.08443661965429783,
      "rewards/curriculum_aware_reward_fn": 0.046875,
      "rewards/format_reward": 0.3515625,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 552.4921875,
      "epoch": 0.5042016806722689,
      "grad_norm": 0.4926210641860962,
      "kl": 0.005392551422119141,
      "learning_rate": 1e-06,
      "loss": 0.0285,
      "num_tokens": 3947807.0,
      "reward": 0.4683388201519847,
      "reward_std": 0.11112732999026775,
      "rewards/curriculum_aware_reward_fn": 0.03865131549537182,
      "rewards/format_reward": 0.4296875,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 559.6953125,
      "epoch": 0.5210084033613446,
      "grad_norm": 0.5463467240333557,
      "kl": 0.004418373107910156,
      "learning_rate": 1e-06,
      "loss": -0.0233,
      "num_tokens": 4080704.0,
      "reward": 0.22203946067020297,
      "reward_std": 0.09257729165256023,
      "rewards/curriculum_aware_reward_fn": 0.042351973708719015,
      "rewards/format_reward": 0.1796875,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 549.921875,
      "epoch": 0.5378151260504201,
      "grad_norm": 0.36463335156440735,
      "kl": 0.006511688232421875,
      "learning_rate": 1e-06,
      "loss": -0.0032,
      "num_tokens": 4214870.0,
      "reward": 0.4346217215061188,
      "reward_std": 0.03605314111337066,
      "rewards/curriculum_aware_reward_fn": 0.004934210563078523,
      "rewards/format_reward": 0.4296875,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 478.7890625,
      "epoch": 0.5546218487394958,
      "grad_norm": 0.5116223692893982,
      "kl": 0.008532524108886719,
      "learning_rate": 1e-06,
      "loss": -0.0153,
      "num_tokens": 4338203.0,
      "reward": 0.4560032896697521,
      "reward_std": 0.12314211018383503,
      "rewards/curriculum_aware_reward_fn": 0.08881578966975212,
      "rewards/format_reward": 0.3671875,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 476.7890625,
      "epoch": 0.5714285714285714,
      "grad_norm": 0.43187472224235535,
      "kl": 0.007843017578125,
      "learning_rate": 1e-06,
      "loss": 0.0134,
      "num_tokens": 4461184.0,
      "reward": 0.4333881586790085,
      "reward_std": 0.12357822060585022,
      "rewards/curriculum_aware_reward_fn": 0.02713815774768591,
      "rewards/format_reward": 0.40625,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 529.7578125,
      "epoch": 0.5882352941176471,
      "grad_norm": 0.4466142952442169,
      "kl": 0.0057315826416015625,
      "learning_rate": 1e-06,
      "loss": -0.015,
      "num_tokens": 4590329.0,
      "reward": 0.426809199154377,
      "reward_std": 0.10671343095600605,
      "rewards/curriculum_aware_reward_fn": 0.059621710097417235,
      "rewards/format_reward": 0.3671875,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 521.9453125,
      "epoch": 0.6050420168067226,
      "grad_norm": 0.5088793635368347,
      "kl": 0.00739288330078125,
      "learning_rate": 1e-06,
      "loss": 0.0193,
      "num_tokens": 4717658.0,
      "reward": 0.5740131624042988,
      "reward_std": 0.09916227497160435,
      "rewards/curriculum_aware_reward_fn": 0.08182565809693187,
      "rewards/format_reward": 0.4921875,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 481.59375,
      "epoch": 0.6218487394957983,
      "grad_norm": 0.3755647540092468,
      "kl": 0.005794525146484375,
      "learning_rate": 1e-06,
      "loss": 0.0106,
      "num_tokens": 4837174.0,
      "reward": 0.5123355314135551,
      "reward_std": 0.023199534974992275,
      "rewards/curriculum_aware_reward_fn": 0.0748355258256197,
      "rewards/format_reward": 0.4375,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 465.1953125,
      "epoch": 0.6386554621848739,
      "grad_norm": 0.5442925691604614,
      "kl": 0.008731842041015625,
      "learning_rate": 1e-06,
      "loss": -0.0111,
      "num_tokens": 4953039.0,
      "reward": 0.7232730239629745,
      "reward_std": 0.1315580508671701,
      "rewards/curriculum_aware_reward_fn": 0.16077302338089794,
      "rewards/format_reward": 0.5625,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 501.484375,
      "epoch": 0.6554621848739496,
      "grad_norm": 0.4446295201778412,
      "kl": 0.00624847412109375,
      "learning_rate": 1e-06,
      "loss": 0.0286,
      "num_tokens": 5076965.0,
      "reward": 0.47327301651239395,
      "reward_std": 0.08440816402435303,
      "rewards/curriculum_aware_reward_fn": 0.09827302861958742,
      "rewards/format_reward": 0.375,
      "step": 39
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 539.15625,
      "epoch": 0.6722689075630253,
      "grad_norm": 0.37400856614112854,
      "kl": 0.005260467529296875,
      "learning_rate": 1e-06,
      "loss": 0.0044,
      "num_tokens": 5207185.0,
      "reward": 0.4745065679308027,
      "reward_std": 0.07072597183287144,
      "rewards/curriculum_aware_reward_fn": 0.09950657887384295,
      "rewards/format_reward": 0.375,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 492.0859375,
      "epoch": 0.6890756302521008,
      "grad_norm": 0.4103780686855316,
      "kl": 0.00856781005859375,
      "learning_rate": 1e-06,
      "loss": 0.0049,
      "num_tokens": 5328012.0,
      "reward": 0.71875,
      "reward_std": 0.10247145313769579,
      "rewards/curriculum_aware_reward_fn": 0.09375000419095159,
      "rewards/format_reward": 0.625,
      "step": 41
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 405.1328125,
      "epoch": 0.7058823529411765,
      "grad_norm": 0.6738374829292297,
      "kl": 0.0108184814453125,
      "learning_rate": 1e-06,
      "loss": 0.0454,
      "num_tokens": 5438933.0,
      "reward": 0.757401317358017,
      "reward_std": 0.212964728474617,
      "rewards/curriculum_aware_reward_fn": 0.1636513164266944,
      "rewards/format_reward": 0.59375,
      "step": 42
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 516.640625,
      "epoch": 0.7226890756302521,
      "grad_norm": 0.31194940209388733,
      "kl": 0.0074005126953125,
      "learning_rate": 1e-06,
      "loss": -0.0205,
      "num_tokens": 5563887.0,
      "reward": 0.6562500149011612,
      "reward_std": 0.04224720690399408,
      "rewards/curriculum_aware_reward_fn": 0.15625,
      "rewards/format_reward": 0.5,
      "step": 43
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 477.3984375,
      "epoch": 0.7394957983193278,
      "grad_norm": 0.38581541180610657,
      "kl": 0.00885009765625,
      "learning_rate": 1e-06,
      "loss": -0.0164,
      "num_tokens": 5688114.0,
      "reward": 0.6402138248085976,
      "reward_std": 0.08311590366065502,
      "rewards/curriculum_aware_reward_fn": 0.03083881549537182,
      "rewards/format_reward": 0.609375,
      "step": 44
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 521.8046875,
      "epoch": 0.7563025210084033,
      "grad_norm": 0.36903509497642517,
      "kl": 0.0078277587890625,
      "learning_rate": 1e-06,
      "loss": -0.0022,
      "num_tokens": 5814153.0,
      "reward": 0.5513980239629745,
      "reward_std": 0.06967925047501922,
      "rewards/curriculum_aware_reward_fn": 0.05139802524354309,
      "rewards/format_reward": 0.5,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 416.4296875,
      "epoch": 0.773109243697479,
      "grad_norm": 0.5821658968925476,
      "kl": 0.0094757080078125,
      "learning_rate": 1e-06,
      "loss": -0.0056,
      "num_tokens": 5923904.0,
      "reward": 0.7257401347160339,
      "reward_std": 0.13419464463368058,
      "rewards/curriculum_aware_reward_fn": 0.10074013192206621,
      "rewards/format_reward": 0.625,
      "step": 46
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 526.9296875,
      "epoch": 0.7899159663865546,
      "grad_norm": 0.449553519487381,
      "kl": 0.005664825439453125,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 6053447.0,
      "reward": 0.4819078892469406,
      "reward_std": 0.09099963493645191,
      "rewards/curriculum_aware_reward_fn": 0.10690789669752121,
      "rewards/format_reward": 0.375,
      "step": 47
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 536.5,
      "epoch": 0.8067226890756303,
      "grad_norm": 0.5381475687026978,
      "kl": 0.008424758911132812,
      "learning_rate": 1e-06,
      "loss": 0.0099,
      "num_tokens": 6183559.0,
      "reward": 0.46833881735801697,
      "reward_std": 0.08668615715578198,
      "rewards/curriculum_aware_reward_fn": 0.03865131642669439,
      "rewards/format_reward": 0.4296875,
      "step": 48
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 539.6328125,
      "epoch": 0.8235294117647058,
      "grad_norm": 0.44155657291412354,
      "kl": 0.0077495574951171875,
      "learning_rate": 1e-06,
      "loss": 0.0085,
      "num_tokens": 6314544.0,
      "reward": 0.5526315867900848,
      "reward_std": 0.027912108227610588,
      "rewards/curriculum_aware_reward_fn": 0.11513157933950424,
      "rewards/format_reward": 0.4375,
      "step": 49
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 554.0546875,
      "epoch": 0.8403361344537815,
      "grad_norm": 0.4840262532234192,
      "kl": 0.0054950714111328125,
      "learning_rate": 1e-06,
      "loss": 0.0073,
      "num_tokens": 6445087.0,
      "reward": 0.33634869009256363,
      "reward_std": 0.10334387933835387,
      "rewards/curriculum_aware_reward_fn": 0.03166118450462818,
      "rewards/format_reward": 0.3046875,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 578.6796875,
      "epoch": 0.8571428571428571,
      "grad_norm": 0.30791598558425903,
      "kl": 0.005002021789550781,
      "learning_rate": 1e-06,
      "loss": 0.0068,
      "num_tokens": 6582878.0,
      "reward": 0.348684199154377,
      "reward_std": 0.07469352334737778,
      "rewards/curriculum_aware_reward_fn": 0.036184209398925304,
      "rewards/format_reward": 0.3125,
      "step": 51
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 448.1328125,
      "epoch": 0.8739495798319328,
      "grad_norm": 0.5027822852134705,
      "kl": 0.00795745849609375,
      "learning_rate": 1e-06,
      "loss": -0.0178,
      "num_tokens": 6698503.0,
      "reward": 0.6311677470803261,
      "reward_std": 0.11679959110915661,
      "rewards/curriculum_aware_reward_fn": 0.09210526384413242,
      "rewards/format_reward": 0.5390625,
      "step": 52
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 521.1875,
      "epoch": 0.8907563025210085,
      "grad_norm": 0.4084753394126892,
      "kl": 0.00714111328125,
      "learning_rate": 1e-06,
      "loss": 0.0193,
      "num_tokens": 6823951.0,
      "reward": 0.5028782784938812,
      "reward_std": 0.059696739073842764,
      "rewards/curriculum_aware_reward_fn": 0.06537829001899809,
      "rewards/format_reward": 0.4375,
      "step": 53
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 539.109375,
      "epoch": 0.907563025210084,
      "grad_norm": 0.2098054140806198,
      "kl": 0.007198333740234375,
      "learning_rate": 1e-06,
      "loss": 0.0114,
      "num_tokens": 6953317.0,
      "reward": 0.46052631735801697,
      "reward_std": 0.03168220818042755,
      "rewards/curriculum_aware_reward_fn": 0.023026317358016968,
      "rewards/format_reward": 0.4375,
      "step": 54
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 521.2265625,
      "epoch": 0.9243697478991597,
      "grad_norm": 0.4919142425060272,
      "kl": 0.007293701171875,
      "learning_rate": 1e-06,
      "loss": 0.0101,
      "num_tokens": 7079922.0,
      "reward": 0.49547697603702545,
      "reward_std": 0.10914274398237467,
      "rewards/curriculum_aware_reward_fn": 0.12047697883099318,
      "rewards/format_reward": 0.375,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 500.5625,
      "epoch": 0.9411764705882353,
      "grad_norm": 0.46875280141830444,
      "kl": 0.00684356689453125,
      "learning_rate": 1e-06,
      "loss": 0.0266,
      "num_tokens": 7206954.0,
      "reward": 0.40830591320991516,
      "reward_std": 0.1075905729085207,
      "rewards/curriculum_aware_reward_fn": 0.04111842066049576,
      "rewards/format_reward": 0.3671875,
      "step": 56
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 482.3046875,
      "epoch": 0.957983193277311,
      "grad_norm": 0.40924757719039917,
      "kl": 0.012725830078125,
      "learning_rate": 1e-06,
      "loss": 0.0041,
      "num_tokens": 7327857.0,
      "reward": 0.5958059132099152,
      "reward_std": 0.06403321353718638,
      "rewards/curriculum_aware_reward_fn": 0.04111842007841915,
      "rewards/format_reward": 0.5546875,
      "step": 57
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 445.375,
      "epoch": 0.9747899159663865,
      "grad_norm": 0.4467240273952484,
      "kl": 0.0105743408203125,
      "learning_rate": 1e-06,
      "loss": -0.0061,
      "num_tokens": 7440561.0,
      "reward": 0.7578125,
      "reward_std": 0.057358515448868275,
      "rewards/curriculum_aware_reward_fn": 0.13281250069849193,
      "rewards/format_reward": 0.625,
      "step": 58
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 582.3452377319336,
      "epoch": 0.9915966386554622,
      "grad_norm": 0.5007306933403015,
      "kl": 0.007415771484375,
      "learning_rate": 1e-06,
      "loss": 0.0029,
      "num_tokens": 7569086.0,
      "reward": 0.4514802545309067,
      "reward_std": 0.06341935088858008,
      "rewards/curriculum_aware_reward_fn": 0.0217927637277171,
      "rewards/format_reward": 0.4296875,
      "step": 59
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 553.09375,
      "epoch": 1.0168067226890756,
      "grad_norm": 0.4292355179786682,
      "kl": 0.005462646484375,
      "learning_rate": 1e-06,
      "loss": 0.0119,
      "num_tokens": 7702626.0,
      "reward": 0.4325658082962036,
      "reward_std": 0.07455102633684874,
      "rewards/curriculum_aware_reward_fn": 0.05756579013541341,
      "rewards/format_reward": 0.375,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 521.375,
      "epoch": 1.0336134453781514,
      "grad_norm": 0.41578003764152527,
      "kl": 0.008762359619140625,
      "learning_rate": 1e-06,
      "loss": -0.0056,
      "num_tokens": 7827818.0,
      "reward": 0.5082236900925636,
      "reward_std": 0.07253926200792193,
      "rewards/curriculum_aware_reward_fn": 0.07072368497028947,
      "rewards/format_reward": 0.4375,
      "step": 61
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 648.2734375,
      "epoch": 1.050420168067227,
      "grad_norm": 0.48642197251319885,
      "kl": 0.0062713623046875,
      "learning_rate": 1e-06,
      "loss": 0.0136,
      "num_tokens": 7974333.0,
      "reward": 0.3449835442006588,
      "reward_std": 0.07259867247194052,
      "rewards/curriculum_aware_reward_fn": 0.03248355258256197,
      "rewards/format_reward": 0.3125,
      "step": 62
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 436.9296875,
      "epoch": 1.0672268907563025,
      "grad_norm": 0.3184286653995514,
      "kl": 0.0114593505859375,
      "learning_rate": 1e-06,
      "loss": 0.0139,
      "num_tokens": 8084908.0,
      "reward": 0.6899671256542206,
      "reward_std": 0.0728745711967349,
      "rewards/curriculum_aware_reward_fn": 0.0649671049322933,
      "rewards/format_reward": 0.625,
      "step": 63
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 541.53125,
      "epoch": 1.084033613445378,
      "grad_norm": 0.16483676433563232,
      "kl": 0.0060882568359375,
      "learning_rate": 1e-06,
      "loss": -0.0032,
      "num_tokens": 8216696.0,
      "reward": 0.2627467066049576,
      "reward_std": 0.024391429498791695,
      "rewards/curriculum_aware_reward_fn": 0.012746710330247879,
      "rewards/format_reward": 0.25,
      "step": 64
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 509.7890625,
      "epoch": 1.1008403361344539,
      "grad_norm": 0.4256879985332489,
      "kl": 0.00730133056640625,
      "learning_rate": 1e-06,
      "loss": -0.0102,
      "num_tokens": 8342845.0,
      "reward": 0.5197368264198303,
      "reward_std": 0.030515023041516542,
      "rewards/curriculum_aware_reward_fn": 0.019736842485144734,
      "rewards/format_reward": 0.5,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 486.296875,
      "epoch": 1.1176470588235294,
      "grad_norm": 0.3091375231742859,
      "kl": 0.008016586303710938,
      "learning_rate": 1e-06,
      "loss": -0.003,
      "num_tokens": 8462971.0,
      "reward": 0.46299341320991516,
      "reward_std": 0.04847824294120073,
      "rewards/curriculum_aware_reward_fn": 0.025493420660495758,
      "rewards/format_reward": 0.4375,
      "step": 66
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 596.234375,
      "epoch": 1.134453781512605,
      "grad_norm": 0.4554305076599121,
      "kl": 0.006458282470703125,
      "learning_rate": 1e-06,
      "loss": 0.011,
      "num_tokens": 8598337.0,
      "reward": 0.3758223643526435,
      "reward_std": 0.08455474488437176,
      "rewards/curriculum_aware_reward_fn": 0.1258223680779338,
      "rewards/format_reward": 0.25,
      "step": 67
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 444.265625,
      "epoch": 1.1512605042016806,
      "grad_norm": 0.4700126349925995,
      "kl": 0.013336181640625,
      "learning_rate": 1e-06,
      "loss": -0.0065,
      "num_tokens": 8715091.0,
      "reward": 0.67434211820364,
      "reward_std": 0.12386543769389391,
      "rewards/curriculum_aware_reward_fn": 0.11965460598003119,
      "rewards/format_reward": 0.5546875,
      "step": 68
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 537.6484375,
      "epoch": 1.1680672268907564,
      "grad_norm": 0.5387859344482422,
      "kl": 0.0084075927734375,
      "learning_rate": 1e-06,
      "loss": 0.0113,
      "num_tokens": 8845582.0,
      "reward": 0.5822368338704109,
      "reward_std": 0.16140672331675887,
      "rewards/curriculum_aware_reward_fn": 0.10567433899268508,
      "rewards/format_reward": 0.4765625,
      "step": 69
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 571.25,
      "epoch": 1.184873949579832,
      "grad_norm": 0.28276559710502625,
      "kl": 0.005802154541015625,
      "learning_rate": 1e-06,
      "loss": 0.0147,
      "num_tokens": 8979574.0,
      "reward": 0.2606907826848328,
      "reward_std": 0.051840442698448896,
      "rewards/curriculum_aware_reward_fn": 0.018503289436921477,
      "rewards/format_reward": 0.2421875,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 495.6953125,
      "epoch": 1.2016806722689075,
      "grad_norm": 0.3467198312282562,
      "kl": 0.007556915283203125,
      "learning_rate": 1e-06,
      "loss": 0.0133,
      "num_tokens": 9104143.0,
      "reward": 0.5476973727345467,
      "reward_std": 0.0878668250516057,
      "rewards/curriculum_aware_reward_fn": 0.04769736947491765,
      "rewards/format_reward": 0.5,
      "step": 71
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 628.96875,
      "epoch": 1.2184873949579833,
      "grad_norm": 0.30438435077667236,
      "kl": 0.0047740936279296875,
      "learning_rate": 1e-06,
      "loss": -0.0058,
      "num_tokens": 9247579.0,
      "reward": 0.25863486528396606,
      "reward_std": 0.05783074861392379,
      "rewards/curriculum_aware_reward_fn": 0.016447368427179754,
      "rewards/format_reward": 0.2421875,
      "step": 72
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 581.7109375,
      "epoch": 1.2352941176470589,
      "grad_norm": 0.16290180385112762,
      "kl": 0.005523681640625,
      "learning_rate": 1e-06,
      "loss": 0.0069,
      "num_tokens": 9383094.0,
      "reward": 0.32195723056793213,
      "reward_std": 0.014439198188483715,
      "rewards/curriculum_aware_reward_fn": 0.009457237087190151,
      "rewards/format_reward": 0.3125,
      "step": 73
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 536.9609375,
      "epoch": 1.2521008403361344,
      "grad_norm": 1.2357046604156494,
      "kl": 0.170867919921875,
      "learning_rate": 1e-06,
      "loss": -0.0054,
      "num_tokens": 9513217.0,
      "reward": 0.582236819434911,
      "reward_std": 0.0510927583090961,
      "rewards/curriculum_aware_reward_fn": 0.01973684225231409,
      "rewards/format_reward": 0.5625,
      "step": 74
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 487.8671875,
      "epoch": 1.26890756302521,
      "grad_norm": 0.46429404616355896,
      "kl": 0.0113677978515625,
      "learning_rate": 1e-06,
      "loss": 0.0446,
      "num_tokens": 9635408.0,
      "reward": 0.726973682641983,
      "reward_std": 0.11705214250832796,
      "rewards/curriculum_aware_reward_fn": 0.10197368450462818,
      "rewards/format_reward": 0.625,
      "step": 75
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 584.296875,
      "epoch": 1.2857142857142856,
      "grad_norm": 0.42755427956581116,
      "kl": 0.00647735595703125,
      "learning_rate": 1e-06,
      "loss": 0.0307,
      "num_tokens": 9770998.0,
      "reward": 0.49136512726545334,
      "reward_std": 0.10772840678691864,
      "rewards/curriculum_aware_reward_fn": 0.06167763099074364,
      "rewards/format_reward": 0.4296875,
      "step": 76
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 429.296875,
      "epoch": 1.3025210084033614,
      "grad_norm": 0.45878008008003235,
      "kl": 0.01023101806640625,
      "learning_rate": 1e-06,
      "loss": 0.0156,
      "num_tokens": 9886868.0,
      "reward": 0.7347861528396606,
      "reward_std": 0.10009488789364696,
      "rewards/curriculum_aware_reward_fn": 0.06291118392255157,
      "rewards/format_reward": 0.671875,
      "step": 77
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 516.6328125,
      "epoch": 1.319327731092437,
      "grad_norm": 0.3113223910331726,
      "kl": 0.0077972412109375,
      "learning_rate": 1e-06,
      "loss": 0.0078,
      "num_tokens": 10011221.0,
      "reward": 0.5966282933950424,
      "reward_std": 0.041548303328454494,
      "rewards/curriculum_aware_reward_fn": 0.09662828780710697,
      "rewards/format_reward": 0.5,
      "step": 78
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 557.4765625,
      "epoch": 1.3361344537815127,
      "grad_norm": 0.33871227502822876,
      "kl": 0.0073699951171875,
      "learning_rate": 1e-06,
      "loss": 0.0191,
      "num_tokens": 10140970.0,
      "reward": 0.5415295958518982,
      "reward_std": 0.07458627689629793,
      "rewards/curriculum_aware_reward_fn": 0.04152960516512394,
      "rewards/format_reward": 0.5,
      "step": 79
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 564.125,
      "epoch": 1.3529411764705883,
      "grad_norm": 0.4491986930370331,
      "kl": 0.006259918212890625,
      "learning_rate": 1e-06,
      "loss": 0.0074,
      "num_tokens": 10271986.0,
      "reward": 0.5296052545309067,
      "reward_std": 0.1359914354979992,
      "rewards/curriculum_aware_reward_fn": 0.09210526291280985,
      "rewards/format_reward": 0.4375,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 503.125,
      "epoch": 1.3697478991596639,
      "grad_norm": 0.2838430404663086,
      "kl": 0.00777435302734375,
      "learning_rate": 1e-06,
      "loss": 0.0186,
      "num_tokens": 10398090.0,
      "reward": 0.6521381586790085,
      "reward_std": 0.05697542009875178,
      "rewards/curriculum_aware_reward_fn": 0.027138158679008484,
      "rewards/format_reward": 0.625,
      "step": 81
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 530.234375,
      "epoch": 1.3865546218487395,
      "grad_norm": 0.4765428602695465,
      "kl": 0.00778961181640625,
      "learning_rate": 1e-06,
      "loss": 0.0302,
      "num_tokens": 10526192.0,
      "reward": 0.6208881437778473,
      "reward_std": 0.12499829288572073,
      "rewards/curriculum_aware_reward_fn": 0.06620065588504076,
      "rewards/format_reward": 0.5546875,
      "step": 82
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 566.84375,
      "epoch": 1.403361344537815,
      "grad_norm": 0.4760180711746216,
      "kl": 0.0066986083984375,
      "learning_rate": 1e-06,
      "loss": 0.0084,
      "num_tokens": 10657412.0,
      "reward": 0.46916117519140244,
      "reward_std": 0.10547287575900555,
      "rewards/curriculum_aware_reward_fn": 0.09416118310764432,
      "rewards/format_reward": 0.375,
      "step": 83
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 560.7734375,
      "epoch": 1.4201680672268908,
      "grad_norm": 0.27778276801109314,
      "kl": 0.005718231201171875,
      "learning_rate": 1e-06,
      "loss": 0.0127,
      "num_tokens": 10788255.0,
      "reward": 0.44736841320991516,
      "reward_std": 0.06990169547498226,
      "rewards/curriculum_aware_reward_fn": 0.07236842112615705,
      "rewards/format_reward": 0.375,
      "step": 84
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 484.9296875,
      "epoch": 1.4369747899159664,
      "grad_norm": 0.34481725096702576,
      "kl": 0.02048492431640625,
      "learning_rate": 1e-06,
      "loss": -0.0032,
      "num_tokens": 10911166.0,
      "reward": 0.7388980239629745,
      "reward_std": 0.08143611438572407,
      "rewards/curriculum_aware_reward_fn": 0.1138980237301439,
      "rewards/format_reward": 0.625,
      "step": 85
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 457.71875,
      "epoch": 1.453781512605042,
      "grad_norm": 0.4829816222190857,
      "kl": 0.0100555419921875,
      "learning_rate": 1e-06,
      "loss": -0.0016,
      "num_tokens": 11027554.0,
      "reward": 0.6735197305679321,
      "reward_std": 0.08864451944828033,
      "rewards/curriculum_aware_reward_fn": 0.11101973801851273,
      "rewards/format_reward": 0.5625,
      "step": 86
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 508.6953125,
      "epoch": 1.4705882352941178,
      "grad_norm": 0.5016542077064514,
      "kl": 0.00922393798828125,
      "learning_rate": 1e-06,
      "loss": 0.01,
      "num_tokens": 11149275.0,
      "reward": 0.6870888322591782,
      "reward_std": 0.08495050063356757,
      "rewards/curriculum_aware_reward_fn": 0.12458881677594036,
      "rewards/format_reward": 0.5625,
      "step": 87
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 602.9921875,
      "epoch": 1.4873949579831933,
      "grad_norm": 0.29301658272743225,
      "kl": 0.004894256591796875,
      "learning_rate": 1e-06,
      "loss": 0.0249,
      "num_tokens": 11288106.0,
      "reward": 0.29481907188892365,
      "reward_std": 0.0620402698405087,
      "rewards/curriculum_aware_reward_fn": 0.04481907875742763,
      "rewards/format_reward": 0.25,
      "step": 88
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 438.71875,
      "epoch": 1.504201680672269,
      "grad_norm": 0.5715950727462769,
      "kl": 0.01503753662109375,
      "learning_rate": 1e-06,
      "loss": -0.0041,
      "num_tokens": 11401118.0,
      "reward": 0.8972039222717285,
      "reward_std": 0.10221139155328274,
      "rewards/curriculum_aware_reward_fn": 0.1472039446234703,
      "rewards/format_reward": 0.75,
      "step": 89
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 547.015625,
      "epoch": 1.5210084033613445,
      "grad_norm": 0.31229323148727417,
      "kl": 0.0074462890625,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 11531400.0,
      "reward": 0.5945723652839661,
      "reward_std": 0.05676991865038872,
      "rewards/curriculum_aware_reward_fn": 0.15707236900925636,
      "rewards/format_reward": 0.4375,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 599.9375,
      "epoch": 1.53781512605042,
      "grad_norm": 0.3754754066467285,
      "kl": 0.005001068115234375,
      "learning_rate": 1e-06,
      "loss": 0.0337,
      "num_tokens": 11667224.0,
      "reward": 0.4358552396297455,
      "reward_std": 0.1078398427926004,
      "rewards/curriculum_aware_reward_fn": 0.060855261399410665,
      "rewards/format_reward": 0.375,
      "step": 91
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 563.3671875,
      "epoch": 1.5546218487394958,
      "grad_norm": 0.44682905077934265,
      "kl": 0.00695037841796875,
      "learning_rate": 1e-06,
      "loss": 0.0261,
      "num_tokens": 11800087.0,
      "reward": 0.47820721566677094,
      "reward_std": 0.11488656094297767,
      "rewards/curriculum_aware_reward_fn": 0.10320723743643612,
      "rewards/format_reward": 0.375,
      "step": 92
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 524.7578125,
      "epoch": 1.5714285714285714,
      "grad_norm": 0.4093223214149475,
      "kl": 0.0079803466796875,
      "learning_rate": 1e-06,
      "loss": -0.0003,
      "num_tokens": 11927808.0,
      "reward": 0.5555098727345467,
      "reward_std": 0.10677139926701784,
      "rewards/curriculum_aware_reward_fn": 0.06332236900925636,
      "rewards/format_reward": 0.4921875,
      "step": 93
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 612.375,
      "epoch": 1.5882352941176472,
      "grad_norm": 0.28754857182502747,
      "kl": 0.004489898681640625,
      "learning_rate": 1e-06,
      "loss": -0.0277,
      "num_tokens": 12069560.0,
      "reward": 0.3371710553765297,
      "reward_std": 0.050214093178510666,
      "rewards/curriculum_aware_reward_fn": 0.02467105258256197,
      "rewards/format_reward": 0.3125,
      "step": 94
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 383.2578125,
      "epoch": 1.6050420168067228,
      "grad_norm": 0.47502318024635315,
      "kl": 0.0126953125,
      "learning_rate": 1e-06,
      "loss": 0.016,
      "num_tokens": 12176625.0,
      "reward": 0.7224506512284279,
      "reward_std": 0.10677911480888724,
      "rewards/curriculum_aware_reward_fn": 0.10526315728202462,
      "rewards/format_reward": 0.6171875,
      "step": 95
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 578.078125,
      "epoch": 1.6218487394957983,
      "grad_norm": 0.34693828225135803,
      "kl": 0.006988525390625,
      "learning_rate": 1e-06,
      "loss": 0.006,
      "num_tokens": 12310939.0,
      "reward": 0.5254934206604958,
      "reward_std": 0.06210480257868767,
      "rewards/curriculum_aware_reward_fn": 0.08799342112615705,
      "rewards/format_reward": 0.4375,
      "step": 96
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 529.828125,
      "epoch": 1.638655462184874,
      "grad_norm": 2.9580295085906982,
      "kl": 0.21123504638671875,
      "learning_rate": 1e-06,
      "loss": 0.0019,
      "num_tokens": 12436949.0,
      "reward": 0.5230263099074364,
      "reward_std": 0.13364601507782936,
      "rewards/curriculum_aware_reward_fn": 0.11677631549537182,
      "rewards/format_reward": 0.40625,
      "step": 97
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 470.8828125,
      "epoch": 1.6554621848739495,
      "grad_norm": 0.39620673656463623,
      "kl": 0.00954437255859375,
      "learning_rate": 1e-06,
      "loss": -0.0048,
      "num_tokens": 12558190.0,
      "reward": 0.8194901347160339,
      "reward_std": 0.09049705043435097,
      "rewards/curriculum_aware_reward_fn": 0.26480263471603394,
      "rewards/format_reward": 0.5546875,
      "step": 98
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 495.3515625,
      "epoch": 1.6722689075630253,
      "grad_norm": 0.5109691619873047,
      "kl": 0.007015228271484375,
      "learning_rate": 1e-06,
      "loss": 0.0351,
      "num_tokens": 12681859.0,
      "reward": 0.4362664595246315,
      "reward_std": 0.0971333347260952,
      "rewards/curriculum_aware_reward_fn": 0.1237664483487606,
      "rewards/format_reward": 0.3125,
      "step": 99
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 478.0703125,
      "epoch": 1.6890756302521008,
      "grad_norm": 0.4189630150794983,
      "kl": 0.0095977783203125,
      "learning_rate": 1e-06,
      "loss": 0.0011,
      "num_tokens": 12801148.0,
      "reward": 0.6920230239629745,
      "reward_std": 0.10883715003728867,
      "rewards/curriculum_aware_reward_fn": 0.19983552768826485,
      "rewards/format_reward": 0.4921875,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 460.6875,
      "epoch": 1.7058823529411766,
      "grad_norm": 0.5282026529312134,
      "kl": 0.007904052734375,
      "learning_rate": 1e-06,
      "loss": -0.0042,
      "num_tokens": 12921692.0,
      "reward": 0.3396381661295891,
      "reward_std": 0.11080991290509701,
      "rewards/curriculum_aware_reward_fn": 0.04276315798051655,
      "rewards/format_reward": 0.296875,
      "step": 101
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 554.40625,
      "epoch": 1.7226890756302522,
      "grad_norm": 0.5177521109580994,
      "kl": 0.01079559326171875,
      "learning_rate": 1e-06,
      "loss": -0.009,
      "num_tokens": 13052136.0,
      "reward": 0.36965460516512394,
      "reward_std": 0.10201659612357616,
      "rewards/curriculum_aware_reward_fn": 0.01809210516512394,
      "rewards/format_reward": 0.3515625,
      "step": 102
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 524.421875,
      "epoch": 1.7394957983193278,
      "grad_norm": 0.44328662753105164,
      "kl": 0.008655548095703125,
      "learning_rate": 1e-06,
      "loss": 0.0114,
      "num_tokens": 13178822.0,
      "reward": 0.5349506437778473,
      "reward_std": 0.12413342297077179,
      "rewards/curriculum_aware_reward_fn": 0.058388158679008484,
      "rewards/format_reward": 0.4765625,
      "step": 103
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 446.46875,
      "epoch": 1.7563025210084033,
      "grad_norm": 0.647972583770752,
      "kl": 0.01692962646484375,
      "learning_rate": 1e-06,
      "loss": 0.0047,
      "num_tokens": 13297402.0,
      "reward": 0.6476151421666145,
      "reward_std": 0.22924628667533398,
      "rewards/curriculum_aware_reward_fn": 0.07730263285338879,
      "rewards/format_reward": 0.5703125,
      "step": 104
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 503.703125,
      "epoch": 1.773109243697479,
      "grad_norm": 0.631151556968689,
      "kl": 0.008514404296875,
      "learning_rate": 1e-06,
      "loss": 0.0008,
      "num_tokens": 13418340.0,
      "reward": 0.46299341320991516,
      "reward_std": 0.2022387906908989,
      "rewards/curriculum_aware_reward_fn": 0.06455592066049576,
      "rewards/format_reward": 0.3984375,
      "step": 105
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 562.03125,
      "epoch": 1.7899159663865545,
      "grad_norm": 0.3566150963306427,
      "kl": 0.006641387939453125,
      "learning_rate": 1e-06,
      "loss": -0.0002,
      "num_tokens": 13550952.0,
      "reward": 0.35773025802336633,
      "reward_std": 0.09330996312201023,
      "rewards/curriculum_aware_reward_fn": 0.05304276151582599,
      "rewards/format_reward": 0.3046875,
      "step": 106
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 539.1796875,
      "epoch": 1.8067226890756303,
      "grad_norm": 0.4120214581489563,
      "kl": 0.00933074951171875,
      "learning_rate": 1e-06,
      "loss": 0.0026,
      "num_tokens": 13678999.0,
      "reward": 0.5435855314135551,
      "reward_std": 0.15557273291051388,
      "rewards/curriculum_aware_reward_fn": 0.12171052396297455,
      "rewards/format_reward": 0.421875,
      "step": 107
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 542.1796875,
      "epoch": 1.8235294117647058,
      "grad_norm": 0.36332470178604126,
      "kl": 0.00751495361328125,
      "learning_rate": 1e-06,
      "loss": 0.0098,
      "num_tokens": 13811206.0,
      "reward": 0.48643091320991516,
      "reward_std": 0.13410842791199684,
      "rewards/curriculum_aware_reward_fn": 0.11924342392012477,
      "rewards/format_reward": 0.3671875,
      "step": 108
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 553.953125,
      "epoch": 1.8403361344537816,
      "grad_norm": 0.3152480721473694,
      "kl": 0.00626373291015625,
      "learning_rate": 1e-06,
      "loss": 0.0016,
      "num_tokens": 13945808.0,
      "reward": 0.3293585479259491,
      "reward_std": 0.045257058925926685,
      "rewards/curriculum_aware_reward_fn": 0.016858553048223257,
      "rewards/format_reward": 0.3125,
      "step": 109
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 573.59375,
      "epoch": 1.8571428571428572,
      "grad_norm": 0.2340080589056015,
      "kl": 0.00682830810546875,
      "learning_rate": 1e-06,
      "loss": 0.0071,
      "num_tokens": 14080460.0,
      "reward": 0.3347039520740509,
      "reward_std": 0.038679007440805435,
      "rewards/curriculum_aware_reward_fn": 0.02220394741743803,
      "rewards/format_reward": 0.3125,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 463.0390625,
      "epoch": 1.8739495798319328,
      "grad_norm": 0.36526933312416077,
      "kl": 0.009578704833984375,
      "learning_rate": 1e-06,
      "loss": 0.0039,
      "num_tokens": 14201065.0,
      "reward": 0.6328125149011612,
      "reward_std": 0.05027205403894186,
      "rewards/curriculum_aware_reward_fn": 0.07031250046566129,
      "rewards/format_reward": 0.5625,
      "step": 111
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 481.9609375,
      "epoch": 1.8907563025210083,
      "grad_norm": 0.4954119324684143,
      "kl": 0.0100555419921875,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 14323068.0,
      "reward": 0.5254934206604958,
      "reward_std": 0.12779070809483528,
      "rewards/curriculum_aware_reward_fn": 0.08799342159181833,
      "rewards/format_reward": 0.4375,
      "step": 112
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 494.96875,
      "epoch": 1.907563025210084,
      "grad_norm": 0.46778982877731323,
      "kl": 0.00978851318359375,
      "learning_rate": 1e-06,
      "loss": 0.0199,
      "num_tokens": 14447008.0,
      "reward": 0.5370065793395042,
      "reward_std": 0.1048955712467432,
      "rewards/curriculum_aware_reward_fn": 0.09950657980516553,
      "rewards/format_reward": 0.4375,
      "step": 113
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 501.6796875,
      "epoch": 1.9243697478991597,
      "grad_norm": 0.3055194616317749,
      "kl": 0.00933074951171875,
      "learning_rate": 1e-06,
      "loss": 0.0032,
      "num_tokens": 14571103.0,
      "reward": 0.5111019909381866,
      "reward_std": 0.024554526433348656,
      "rewards/curriculum_aware_reward_fn": 0.08141447440721095,
      "rewards/format_reward": 0.4296875,
      "step": 114
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 508.8203125,
      "epoch": 1.9411764705882353,
      "grad_norm": 0.4632183611392975,
      "kl": 0.012451171875,
      "learning_rate": 1e-06,
      "loss": 0.0157,
      "num_tokens": 14694424.0,
      "reward": 0.6089638024568558,
      "reward_std": 0.10860061645507812,
      "rewards/curriculum_aware_reward_fn": 0.11677631549537182,
      "rewards/format_reward": 0.4921875,
      "step": 115
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 495.6875,
      "epoch": 1.957983193277311,
      "grad_norm": 0.41369161009788513,
      "kl": 0.0089874267578125,
      "learning_rate": 1e-06,
      "loss": 0.0212,
      "num_tokens": 14819792.0,
      "reward": 0.4621710479259491,
      "reward_std": 0.07010683044791222,
      "rewards/curriculum_aware_reward_fn": 0.0871710479259491,
      "rewards/format_reward": 0.375,
      "step": 116
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 529.359375,
      "epoch": 1.9747899159663866,
      "grad_norm": 0.40478190779685974,
      "kl": 0.012042999267578125,
      "learning_rate": 1e-06,
      "loss": 0.0388,
      "num_tokens": 14946718.0,
      "reward": 0.48190788179636,
      "reward_std": 0.10751516558229923,
      "rewards/curriculum_aware_reward_fn": 0.11472039762884378,
      "rewards/format_reward": 0.3671875,
      "step": 117
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 493.4881134033203,
      "epoch": 1.9915966386554622,
      "grad_norm": 0.3562357425689697,
      "kl": 0.0123748779296875,
      "learning_rate": 1e-06,
      "loss": 0.0141,
      "num_tokens": 15064457.0,
      "reward": 0.6706414446234703,
      "reward_std": 0.101046122610569,
      "rewards/curriculum_aware_reward_fn": 0.05345394788309932,
      "rewards/format_reward": 0.6171875,
      "step": 118
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 517.7578125,
      "epoch": 2.0168067226890756,
      "grad_norm": 0.3487071394920349,
      "kl": 0.0104217529296875,
      "learning_rate": 1e-06,
      "loss": 0.0163,
      "num_tokens": 15191538.0,
      "reward": 0.5201480090618134,
      "reward_std": 0.04716231161728501,
      "rewards/curriculum_aware_reward_fn": 0.02014802652411163,
      "rewards/format_reward": 0.5,
      "step": 119
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 577.765625,
      "epoch": 2.033613445378151,
      "grad_norm": 0.35752227902412415,
      "kl": 0.008148193359375,
      "learning_rate": 1e-06,
      "loss": 0.0108,
      "num_tokens": 15327204.0,
      "reward": 0.42763157933950424,
      "reward_std": 0.09388388879597187,
      "rewards/curriculum_aware_reward_fn": 0.05263157933950424,
      "rewards/format_reward": 0.375,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 460.53125,
      "epoch": 2.0504201680672267,
      "grad_norm": 0.5020465850830078,
      "kl": 0.014190673828125,
      "learning_rate": 1e-06,
      "loss": 0.0113,
      "num_tokens": 15447608.0,
      "reward": 0.693256601691246,
      "reward_std": 0.12680460885167122,
      "rewards/curriculum_aware_reward_fn": 0.06825657840818167,
      "rewards/format_reward": 0.625,
      "step": 121
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 526.7890625,
      "epoch": 2.0672268907563027,
      "grad_norm": 0.33090242743492126,
      "kl": 0.00830841064453125,
      "learning_rate": 1e-06,
      "loss": 0.0212,
      "num_tokens": 15577021.0,
      "reward": 0.3022203971631825,
      "reward_std": 0.052566134836524725,
      "rewards/curriculum_aware_reward_fn": 0.0522203971631825,
      "rewards/format_reward": 0.25,
      "step": 122
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 465.390625,
      "epoch": 2.0840336134453783,
      "grad_norm": 0.25564736127853394,
      "kl": 0.018894195556640625,
      "learning_rate": 1e-06,
      "loss": 0.001,
      "num_tokens": 15693543.0,
      "reward": 0.5879934281110764,
      "reward_std": 0.03513536183163524,
      "rewards/curriculum_aware_reward_fn": 0.15830592159181833,
      "rewards/format_reward": 0.4296875,
      "step": 123
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 438.015625,
      "epoch": 2.100840336134454,
      "grad_norm": 0.5210288763046265,
      "kl": 0.0128936767578125,
      "learning_rate": 1e-06,
      "loss": 0.038,
      "num_tokens": 15805441.0,
      "reward": 0.7685032784938812,
      "reward_std": 0.15490676742047071,
      "rewards/curriculum_aware_reward_fn": 0.20600328128784895,
      "rewards/format_reward": 0.5625,
      "step": 124
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 419.3515625,
      "epoch": 2.1176470588235294,
      "grad_norm": 0.48274165391921997,
      "kl": 0.01959228515625,
      "learning_rate": 1e-06,
      "loss": 0.0226,
      "num_tokens": 15913862.0,
      "reward": 0.671875,
      "reward_std": 0.11604671645909548,
      "rewards/curriculum_aware_reward_fn": 0.10937500139698386,
      "rewards/format_reward": 0.5625,
      "step": 125
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 520.8203125,
      "epoch": 2.134453781512605,
      "grad_norm": 0.35000789165496826,
      "kl": 0.0090179443359375,
      "learning_rate": 1e-06,
      "loss": -0.0026,
      "num_tokens": 16041007.0,
      "reward": 0.49794407607987523,
      "reward_std": 0.10071868449449539,
      "rewards/curriculum_aware_reward_fn": 0.12294407980516553,
      "rewards/format_reward": 0.375,
      "step": 126
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 557.6328125,
      "epoch": 2.1512605042016806,
      "grad_norm": 0.5103374719619751,
      "kl": 0.0096435546875,
      "learning_rate": 1e-06,
      "loss": 0.0051,
      "num_tokens": 16173728.0,
      "reward": 0.45641446858644485,
      "reward_std": 0.10976400738582015,
      "rewards/curriculum_aware_reward_fn": 0.08141447091475129,
      "rewards/format_reward": 0.375,
      "step": 127
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 414.9140625,
      "epoch": 2.168067226890756,
      "grad_norm": 0.43994390964508057,
      "kl": 0.014190673828125,
      "learning_rate": 1e-06,
      "loss": -0.0002,
      "num_tokens": 16285445.0,
      "reward": 0.7236842215061188,
      "reward_std": 0.11914092372171581,
      "rewards/curriculum_aware_reward_fn": 0.09868421289138496,
      "rewards/format_reward": 0.625,
      "step": 128
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 567.921875,
      "epoch": 2.184873949579832,
      "grad_norm": 0.319624662399292,
      "kl": 0.0082244873046875,
      "learning_rate": 1e-06,
      "loss": 0.0285,
      "num_tokens": 16420019.0,
      "reward": 0.4259868264198303,
      "reward_std": 0.05608854768797755,
      "rewards/curriculum_aware_reward_fn": 0.11348683759570122,
      "rewards/format_reward": 0.3125,
      "step": 129
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 463.53125,
      "epoch": 2.2016806722689077,
      "grad_norm": 0.359430193901062,
      "kl": 0.014495849609375,
      "learning_rate": 1e-06,
      "loss": 0.0185,
      "num_tokens": 16541143.0,
      "reward": 0.4699835479259491,
      "reward_std": 0.08584295958280563,
      "rewards/curriculum_aware_reward_fn": 0.0949835516512394,
      "rewards/format_reward": 0.375,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 469.609375,
      "epoch": 2.2184873949579833,
      "grad_norm": 0.41892191767692566,
      "kl": 0.0117034912109375,
      "learning_rate": 1e-06,
      "loss": 0.0365,
      "num_tokens": 16662909.0,
      "reward": 0.5522204041481018,
      "reward_std": 0.0973742357455194,
      "rewards/curriculum_aware_reward_fn": 0.052220395184122026,
      "rewards/format_reward": 0.5,
      "step": 131
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 540.5703125,
      "epoch": 2.235294117647059,
      "grad_norm": 0.48490580916404724,
      "kl": 0.0093231201171875,
      "learning_rate": 1e-06,
      "loss": -0.0203,
      "num_tokens": 16795070.0,
      "reward": 0.41324013471603394,
      "reward_std": 0.08475807495415211,
      "rewards/curriculum_aware_reward_fn": 0.038240132853388786,
      "rewards/format_reward": 0.375,
      "step": 132
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 525.1796875,
      "epoch": 2.2521008403361344,
      "grad_norm": 0.4449516832828522,
      "kl": 0.0105438232421875,
      "learning_rate": 1e-06,
      "loss": 0.0023,
      "num_tokens": 16923613.0,
      "reward": 0.5604440867900848,
      "reward_std": 0.1288975402712822,
      "rewards/curriculum_aware_reward_fn": 0.12294407933950424,
      "rewards/format_reward": 0.4375,
      "step": 133
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 476.3125,
      "epoch": 2.26890756302521,
      "grad_norm": 0.4340604543685913,
      "kl": 0.01129150390625,
      "learning_rate": 1e-06,
      "loss": -0.028,
      "num_tokens": 17045693.0,
      "reward": 0.5587993413209915,
      "reward_std": 0.09385511744767427,
      "rewards/curriculum_aware_reward_fn": 0.058799343183636665,
      "rewards/format_reward": 0.5,
      "step": 134
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 420.9921875,
      "epoch": 2.2857142857142856,
      "grad_norm": 0.45602235198020935,
      "kl": 0.01416015625,
      "learning_rate": 1e-06,
      "loss": 0.0042,
      "num_tokens": 17154012.0,
      "reward": 0.7602795735001564,
      "reward_std": 0.09590415796265006,
      "rewards/curriculum_aware_reward_fn": 0.14309210563078523,
      "rewards/format_reward": 0.6171875,
      "step": 135
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 489.6640625,
      "epoch": 2.302521008403361,
      "grad_norm": 0.4504002332687378,
      "kl": 0.0130157470703125,
      "learning_rate": 1e-06,
      "loss": -0.0126,
      "num_tokens": 17274481.0,
      "reward": 0.6295230239629745,
      "reward_std": 0.15420474018901587,
      "rewards/curriculum_aware_reward_fn": 0.13733552629128098,
      "rewards/format_reward": 0.4921875,
      "step": 136
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 492.3046875,
      "epoch": 2.3193277310924367,
      "grad_norm": 0.3228984773159027,
      "kl": 0.0111846923828125,
      "learning_rate": 1e-06,
      "loss": -0.004,
      "num_tokens": 17399360.0,
      "reward": 0.5587993413209915,
      "reward_std": 0.0586426155641675,
      "rewards/curriculum_aware_reward_fn": 0.05879934271797538,
      "rewards/format_reward": 0.5,
      "step": 137
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 508.5625,
      "epoch": 2.3361344537815127,
      "grad_norm": 0.3110595643520355,
      "kl": 0.015472412109375,
      "learning_rate": 1e-06,
      "loss": -0.0107,
      "num_tokens": 17521248.0,
      "reward": 0.546875,
      "reward_std": 0.07312605157494545,
      "rewards/curriculum_aware_reward_fn": 0.0546875,
      "rewards/format_reward": 0.4921875,
      "step": 138
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 574.015625,
      "epoch": 2.3529411764705883,
      "grad_norm": 0.4071909487247467,
      "kl": 0.0107421875,
      "learning_rate": 1e-06,
      "loss": -0.0034,
      "num_tokens": 17659522.0,
      "reward": 0.47450655698776245,
      "reward_std": 0.07414581999182701,
      "rewards/curriculum_aware_reward_fn": 0.03700657980516553,
      "rewards/format_reward": 0.4375,
      "step": 139
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 522.5546875,
      "epoch": 2.369747899159664,
      "grad_norm": 0.34431034326553345,
      "kl": 0.00946044921875,
      "learning_rate": 1e-06,
      "loss": -0.0068,
      "num_tokens": 17788537.0,
      "reward": 0.4099506512284279,
      "reward_std": 0.05903024738654494,
      "rewards/curriculum_aware_reward_fn": 0.0349506571656093,
      "rewards/format_reward": 0.375,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 553.2421875,
      "epoch": 2.3865546218487395,
      "grad_norm": 0.4213170111179352,
      "kl": 0.009979248046875,
      "learning_rate": 1e-06,
      "loss": 0.0132,
      "num_tokens": 17918288.0,
      "reward": 0.4177631586790085,
      "reward_std": 0.08044615527614951,
      "rewards/curriculum_aware_reward_fn": 0.042763158096931875,
      "rewards/format_reward": 0.375,
      "step": 141
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 559.3203125,
      "epoch": 2.403361344537815,
      "grad_norm": 0.23342828452587128,
      "kl": 0.008510589599609375,
      "learning_rate": 1e-06,
      "loss": -0.0035,
      "num_tokens": 18052169.0,
      "reward": 0.3762335553765297,
      "reward_std": 0.03740033693611622,
      "rewards/curriculum_aware_reward_fn": 0.07154605258256197,
      "rewards/format_reward": 0.3046875,
      "step": 142
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 493.078125,
      "epoch": 2.4201680672268906,
      "grad_norm": 0.4362901449203491,
      "kl": 0.012481689453125,
      "learning_rate": 1e-06,
      "loss": 0.0386,
      "num_tokens": 18177251.0,
      "reward": 0.5805921033024788,
      "reward_std": 0.12307591829448938,
      "rewards/curriculum_aware_reward_fn": 0.08840460516512394,
      "rewards/format_reward": 0.4921875,
      "step": 143
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 435.65625,
      "epoch": 2.4369747899159666,
      "grad_norm": 0.6844424605369568,
      "kl": 0.0600128173828125,
      "learning_rate": 1e-06,
      "loss": 0.0209,
      "num_tokens": 18292007.0,
      "reward": 0.6208881735801697,
      "reward_std": 0.15131067298352718,
      "rewards/curriculum_aware_reward_fn": 0.12088816147297621,
      "rewards/format_reward": 0.5,
      "step": 144
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 490.296875,
      "epoch": 2.453781512605042,
      "grad_norm": 0.30699044466018677,
      "kl": 0.010986328125,
      "learning_rate": 1e-06,
      "loss": 0.0248,
      "num_tokens": 18415301.0,
      "reward": 0.49424342066049576,
      "reward_std": 0.04014611290767789,
      "rewards/curriculum_aware_reward_fn": 0.05674342147540301,
      "rewards/format_reward": 0.4375,
      "step": 145
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 471.984375,
      "epoch": 2.4705882352941178,
      "grad_norm": 0.403209924697876,
      "kl": 0.0122528076171875,
      "learning_rate": 1e-06,
      "loss": 0.0353,
      "num_tokens": 18532667.0,
      "reward": 0.6027960330247879,
      "reward_std": 0.0935791190713644,
      "rewards/curriculum_aware_reward_fn": 0.1027960553765297,
      "rewards/format_reward": 0.5,
      "step": 146
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 422.703125,
      "epoch": 2.4873949579831933,
      "grad_norm": 0.42733973264694214,
      "kl": 0.0163116455078125,
      "learning_rate": 1e-06,
      "loss": -0.002,
      "num_tokens": 18645941.0,
      "reward": 0.7845394462347031,
      "reward_std": 0.0871797576546669,
      "rewards/curriculum_aware_reward_fn": 0.0345394741743803,
      "rewards/format_reward": 0.75,
      "step": 147
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 548.828125,
      "epoch": 2.504201680672269,
      "grad_norm": 0.2545667290687561,
      "kl": 0.01213836669921875,
      "learning_rate": 1e-06,
      "loss": 0.0089,
      "num_tokens": 18774111.0,
      "reward": 0.539473682641983,
      "reward_std": 0.060992954298853874,
      "rewards/curriculum_aware_reward_fn": 0.10197368077933788,
      "rewards/format_reward": 0.4375,
      "step": 148
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 540.0703125,
      "epoch": 2.5210084033613445,
      "grad_norm": 0.3914143145084381,
      "kl": 0.00789642333984375,
      "learning_rate": 1e-06,
      "loss": -0.0052,
      "num_tokens": 18904992.0,
      "reward": 0.27878287341445684,
      "reward_std": 0.06910991575568914,
      "rewards/curriculum_aware_reward_fn": 0.02878289413638413,
      "rewards/format_reward": 0.25,
      "step": 149
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 550.9375,
      "epoch": 2.53781512605042,
      "grad_norm": 0.2912365794181824,
      "kl": 0.00799560546875,
      "learning_rate": 1e-06,
      "loss": 0.0067,
      "num_tokens": 19035968.0,
      "reward": 0.3215460553765297,
      "reward_std": 0.01937512680888176,
      "rewards/curriculum_aware_reward_fn": 0.07154605211690068,
      "rewards/format_reward": 0.25,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 471.1640625,
      "epoch": 2.5546218487394956,
      "grad_norm": 0.3965752124786377,
      "kl": 0.01221466064453125,
      "learning_rate": 1e-06,
      "loss": 0.0062,
      "num_tokens": 19153861.0,
      "reward": 0.582648016512394,
      "reward_std": 0.08400850929319859,
      "rewards/curriculum_aware_reward_fn": 0.08264802652411163,
      "rewards/format_reward": 0.5,
      "step": 151
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 421.234375,
      "epoch": 2.571428571428571,
      "grad_norm": 0.6044662594795227,
      "kl": 0.026885986328125,
      "learning_rate": 1e-06,
      "loss": 0.0165,
      "num_tokens": 19265379.0,
      "reward": 0.8112664222717285,
      "reward_std": 0.1459241509437561,
      "rewards/curriculum_aware_reward_fn": 0.19407895021140575,
      "rewards/format_reward": 0.6171875,
      "step": 152
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 546.8671875,
      "epoch": 2.588235294117647,
      "grad_norm": 0.4222107231616974,
      "kl": 0.01050567626953125,
      "learning_rate": 1e-06,
      "loss": 0.0261,
      "num_tokens": 19396626.0,
      "reward": 0.38733551651239395,
      "reward_std": 0.06776260398328304,
      "rewards/curriculum_aware_reward_fn": 0.02014802605845034,
      "rewards/format_reward": 0.3671875,
      "step": 153
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 548.875,
      "epoch": 2.6050420168067228,
      "grad_norm": 0.30043891072273254,
      "kl": 0.010498046875,
      "learning_rate": 1e-06,
      "loss": 0.0291,
      "num_tokens": 19531202.0,
      "reward": 0.28166119009256363,
      "reward_std": 0.07623977493494749,
      "rewards/curriculum_aware_reward_fn": 0.031661184038966894,
      "rewards/format_reward": 0.25,
      "step": 154
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 560.640625,
      "epoch": 2.6218487394957983,
      "grad_norm": 0.39753058552742004,
      "kl": 0.0109710693359375,
      "learning_rate": 1e-06,
      "loss": 0.0213,
      "num_tokens": 19665404.0,
      "reward": 0.5197368338704109,
      "reward_std": 0.08217737264931202,
      "rewards/curriculum_aware_reward_fn": 0.08223684225231409,
      "rewards/format_reward": 0.4375,
      "step": 155
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 481.6640625,
      "epoch": 2.638655462184874,
      "grad_norm": 0.39810478687286377,
      "kl": 0.009063720703125,
      "learning_rate": 1e-06,
      "loss": 0.0044,
      "num_tokens": 19787409.0,
      "reward": 0.44202302396297455,
      "reward_std": 0.08141717128455639,
      "rewards/curriculum_aware_reward_fn": 0.12952302768826485,
      "rewards/format_reward": 0.3125,
      "step": 156
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 389.7890625,
      "epoch": 2.6554621848739495,
      "grad_norm": 0.4911426305770874,
      "kl": 0.02197265625,
      "learning_rate": 1e-06,
      "loss": 0.0028,
      "num_tokens": 19896190.0,
      "reward": 0.7331414520740509,
      "reward_std": 0.17763726785779,
      "rewards/curriculum_aware_reward_fn": 0.1784539483487606,
      "rewards/format_reward": 0.5546875,
      "step": 157
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 518.9609375,
      "epoch": 2.6722689075630255,
      "grad_norm": 0.2420579046010971,
      "kl": 0.011962890625,
      "learning_rate": 1e-06,
      "loss": 0.0303,
      "num_tokens": 20022809.0,
      "reward": 0.4453125,
      "reward_std": 0.01647413382306695,
      "rewards/curriculum_aware_reward_fn": 0.007812500232830644,
      "rewards/format_reward": 0.4375,
      "step": 158
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 424.4453125,
      "epoch": 2.689075630252101,
      "grad_norm": 0.46578091382980347,
      "kl": 0.01375579833984375,
      "learning_rate": 1e-06,
      "loss": 0.0011,
      "num_tokens": 20136314.0,
      "reward": 0.49095392785966396,
      "reward_std": 0.13701673224568367,
      "rewards/curriculum_aware_reward_fn": 0.06126644625328481,
      "rewards/format_reward": 0.4296875,
      "step": 159
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 483.921875,
      "epoch": 2.7058823529411766,
      "grad_norm": 0.32379522919654846,
      "kl": 0.01180267333984375,
      "learning_rate": 1e-06,
      "loss": 0.0065,
      "num_tokens": 20257344.0,
      "reward": 0.5197368343360722,
      "reward_std": 0.07396957790479064,
      "rewards/curriculum_aware_reward_fn": 0.0822368417866528,
      "rewards/format_reward": 0.4375,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 470.3515625,
      "epoch": 2.722689075630252,
      "grad_norm": 0.4478832483291626,
      "kl": 0.014068603515625,
      "learning_rate": 1e-06,
      "loss": -0.0003,
      "num_tokens": 20375685.0,
      "reward": 0.5801809281110764,
      "reward_std": 0.06543473433703184,
      "rewards/curriculum_aware_reward_fn": 0.08018092112615705,
      "rewards/format_reward": 0.5,
      "step": 161
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 462.3046875,
      "epoch": 2.7394957983193278,
      "grad_norm": 0.4915456175804138,
      "kl": 0.0140838623046875,
      "learning_rate": 1e-06,
      "loss": 0.0286,
      "num_tokens": 20491340.0,
      "reward": 0.6981907933950424,
      "reward_std": 0.1432387800887227,
      "rewards/curriculum_aware_reward_fn": 0.13569078594446182,
      "rewards/format_reward": 0.5625,
      "step": 162
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 460.046875,
      "epoch": 2.7563025210084033,
      "grad_norm": 0.388621062040329,
      "kl": 0.0123138427734375,
      "learning_rate": 1e-06,
      "loss": 0.0144,
      "num_tokens": 20613466.0,
      "reward": 0.4124177619814873,
      "reward_std": 0.07370226783677936,
      "rewards/curriculum_aware_reward_fn": 0.037417763262055814,
      "rewards/format_reward": 0.375,
      "step": 163
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 462.25,
      "epoch": 2.773109243697479,
      "grad_norm": 0.4878624677658081,
      "kl": 0.01593017578125,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 20729058.0,
      "reward": 0.6221217066049576,
      "reward_std": 0.12872529029846191,
      "rewards/curriculum_aware_reward_fn": 0.12212171033024788,
      "rewards/format_reward": 0.5,
      "step": 164
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 486.4609375,
      "epoch": 2.7899159663865545,
      "grad_norm": 0.4500262141227722,
      "kl": 0.0099029541015625,
      "learning_rate": 1e-06,
      "loss": 0.0219,
      "num_tokens": 20853869.0,
      "reward": 0.4050164371728897,
      "reward_std": 0.11422262340784073,
      "rewards/curriculum_aware_reward_fn": 0.09251644648611546,
      "rewards/format_reward": 0.3125,
      "step": 165
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 448.5859375,
      "epoch": 2.80672268907563,
      "grad_norm": 0.5006850957870483,
      "kl": 0.0168914794921875,
      "learning_rate": 1e-06,
      "loss": 0.0135,
      "num_tokens": 20973736.0,
      "reward": 0.677631601691246,
      "reward_std": 0.0868874522857368,
      "rewards/curriculum_aware_reward_fn": 0.12294407980516553,
      "rewards/format_reward": 0.5546875,
      "step": 166
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 428.7265625,
      "epoch": 2.8235294117647056,
      "grad_norm": 0.42931458353996277,
      "kl": 0.01781463623046875,
      "learning_rate": 1e-06,
      "loss": 0.0042,
      "num_tokens": 21086485.0,
      "reward": 0.6040295884013176,
      "reward_std": 0.05929867131635547,
      "rewards/curriculum_aware_reward_fn": 0.041529607493430376,
      "rewards/format_reward": 0.5625,
      "step": 167
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 388.421875,
      "epoch": 2.8403361344537816,
      "grad_norm": 0.44046640396118164,
      "kl": 0.0161895751953125,
      "learning_rate": 1e-06,
      "loss": 0.01,
      "num_tokens": 21193627.0,
      "reward": 0.7483552545309067,
      "reward_std": 0.09682157123461366,
      "rewards/curriculum_aware_reward_fn": 0.060855262679979205,
      "rewards/format_reward": 0.6875,
      "step": 168
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 477.6953125,
      "epoch": 2.857142857142857,
      "grad_norm": 0.36667370796203613,
      "kl": 0.0146484375,
      "learning_rate": 1e-06,
      "loss": -0.0002,
      "num_tokens": 21313716.0,
      "reward": 0.6060855239629745,
      "reward_std": 0.10079656913876534,
      "rewards/curriculum_aware_reward_fn": 0.11389802675694227,
      "rewards/format_reward": 0.4921875,
      "step": 169
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 534.4296875,
      "epoch": 2.8739495798319328,
      "grad_norm": 0.3436344563961029,
      "kl": 0.00984954833984375,
      "learning_rate": 1e-06,
      "loss": 0.0022,
      "num_tokens": 21445667.0,
      "reward": 0.48231907933950424,
      "reward_std": 0.08960662921890616,
      "rewards/curriculum_aware_reward_fn": 0.10731907980516553,
      "rewards/format_reward": 0.375,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 496.21875,
      "epoch": 2.8907563025210083,
      "grad_norm": 0.48088422417640686,
      "kl": 0.0130767822265625,
      "learning_rate": 1e-06,
      "loss": 0.0182,
      "num_tokens": 21570871.0,
      "reward": 0.4465460618957877,
      "reward_std": 0.1538134217262268,
      "rewards/curriculum_aware_reward_fn": 0.13404605071991682,
      "rewards/format_reward": 0.3125,
      "step": 171
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 429.484375,
      "epoch": 2.907563025210084,
      "grad_norm": 0.5827536582946777,
      "kl": 0.016109466552734375,
      "learning_rate": 1e-06,
      "loss": 0.0127,
      "num_tokens": 21686093.0,
      "reward": 0.4502467103302479,
      "reward_std": 0.15407454315572977,
      "rewards/curriculum_aware_reward_fn": 0.09087171172723174,
      "rewards/format_reward": 0.359375,
      "step": 172
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 482.4609375,
      "epoch": 2.92436974789916,
      "grad_norm": 0.467061311006546,
      "kl": 0.013336181640625,
      "learning_rate": 1e-06,
      "loss": 0.033,
      "num_tokens": 21808264.0,
      "reward": 0.6632401347160339,
      "reward_std": 0.10484125558286905,
      "rewards/curriculum_aware_reward_fn": 0.22574013099074364,
      "rewards/format_reward": 0.4375,
      "step": 173
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 500.96875,
      "epoch": 2.9411764705882355,
      "grad_norm": 0.41948550939559937,
      "kl": 0.009563446044921875,
      "learning_rate": 1e-06,
      "loss": 0.0329,
      "num_tokens": 21933084.0,
      "reward": 0.400082241743803,
      "reward_std": 0.10662292037159204,
      "rewards/curriculum_aware_reward_fn": 0.04070723708719015,
      "rewards/format_reward": 0.359375,
      "step": 174
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 557.640625,
      "epoch": 2.957983193277311,
      "grad_norm": 0.41708114743232727,
      "kl": 0.007190704345703125,
      "learning_rate": 1e-06,
      "loss": 0.021,
      "num_tokens": 22068550.0,
      "reward": 0.3005756618222222,
      "reward_std": 0.06424513552337885,
      "rewards/curriculum_aware_reward_fn": 0.050575657514855266,
      "rewards/format_reward": 0.25,
      "step": 175
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 482.421875,
      "epoch": 2.9747899159663866,
      "grad_norm": 0.6009016633033752,
      "kl": 0.013702392578125,
      "learning_rate": 1e-06,
      "loss": 0.0048,
      "num_tokens": 22189356.0,
      "reward": 0.6620065569877625,
      "reward_std": 0.149446252733469,
      "rewards/curriculum_aware_reward_fn": 0.16200657933950424,
      "rewards/format_reward": 0.5,
      "step": 176
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 490.53572845458984,
      "epoch": 2.991596638655462,
      "grad_norm": 0.49134695529937744,
      "kl": 0.01397705078125,
      "learning_rate": 1e-06,
      "loss": -0.0025,
      "num_tokens": 22309028.0,
      "reward": 0.6726973652839661,
      "reward_std": 0.14456172287464142,
      "rewards/curriculum_aware_reward_fn": 0.1101973676122725,
      "rewards/format_reward": 0.5625,
      "step": 177
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 358.109375,
      "epoch": 3.0168067226890756,
      "grad_norm": 0.5925723314285278,
      "kl": 0.0204315185546875,
      "learning_rate": 1e-06,
      "loss": 0.0117,
      "num_tokens": 22410794.0,
      "reward": 0.87787826359272,
      "reward_std": 0.1721474528312683,
      "rewards/curriculum_aware_reward_fn": 0.19819078594446182,
      "rewards/format_reward": 0.6796875,
      "step": 178
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 523.0078125,
      "epoch": 3.033613445378151,
      "grad_norm": 0.2975535988807678,
      "kl": 0.01165771484375,
      "learning_rate": 1e-06,
      "loss": 0.0697,
      "num_tokens": 22539299.0,
      "reward": 0.5168585479259491,
      "reward_std": 0.048361226450651884,
      "rewards/curriculum_aware_reward_fn": 0.08717105106916279,
      "rewards/format_reward": 0.4296875,
      "step": 179
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 484.7578125,
      "epoch": 3.0504201680672267,
      "grad_norm": 0.45362988114356995,
      "kl": 0.0162200927734375,
      "learning_rate": 1e-06,
      "loss": 0.0023,
      "num_tokens": 22660588.0,
      "reward": 0.5513980276882648,
      "reward_std": 0.1047646040096879,
      "rewards/curriculum_aware_reward_fn": 0.06702302722260356,
      "rewards/format_reward": 0.484375,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 452.9375,
      "epoch": 3.0672268907563027,
      "grad_norm": 0.5003635883331299,
      "kl": 0.0143890380859375,
      "learning_rate": 1e-06,
      "loss": 0.0104,
      "num_tokens": 22778956.0,
      "reward": 0.73149673640728,
      "reward_std": 0.17891032248735428,
      "rewards/curriculum_aware_reward_fn": 0.23149671405553818,
      "rewards/format_reward": 0.5,
      "step": 181
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 513.0390625,
      "epoch": 3.0840336134453783,
      "grad_norm": 0.31615540385246277,
      "kl": 0.01172637939453125,
      "learning_rate": 1e-06,
      "loss": 0.0235,
      "num_tokens": 22905121.0,
      "reward": 0.3244243338704109,
      "reward_std": 0.03051401791162789,
      "rewards/curriculum_aware_reward_fn": 0.011924341786652803,
      "rewards/format_reward": 0.3125,
      "step": 182
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 411.6015625,
      "epoch": 3.100840336134454,
      "grad_norm": 0.4836508631706238,
      "kl": 0.0144195556640625,
      "learning_rate": 1e-06,
      "loss": 0.0043,
      "num_tokens": 23019342.0,
      "reward": 0.5826480239629745,
      "reward_std": 0.11801502481102943,
      "rewards/curriculum_aware_reward_fn": 0.07483552675694227,
      "rewards/format_reward": 0.5078125,
      "step": 183
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 416.921875,
      "epoch": 3.1176470588235294,
      "grad_norm": 0.3468119204044342,
      "kl": 0.01403045654296875,
      "learning_rate": 1e-06,
      "loss": -0.0007,
      "num_tokens": 23137316.0,
      "reward": 0.47574012295808643,
      "reward_std": 0.05907326890155673,
      "rewards/curriculum_aware_reward_fn": 0.10074013040866703,
      "rewards/format_reward": 0.375,
      "step": 184
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 414.359375,
      "epoch": 3.134453781512605,
      "grad_norm": 0.4667985439300537,
      "kl": 0.0151519775390625,
      "learning_rate": 1e-06,
      "loss": 0.0299,
      "num_tokens": 23249858.0,
      "reward": 0.6344572305679321,
      "reward_std": 0.15162191167473793,
      "rewards/curriculum_aware_reward_fn": 0.13445723662152886,
      "rewards/format_reward": 0.5,
      "step": 185
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 482.109375,
      "epoch": 3.1512605042016806,
      "grad_norm": 0.4111727774143219,
      "kl": 0.013458251953125,
      "learning_rate": 1e-06,
      "loss": 0.0121,
      "num_tokens": 23370304.0,
      "reward": 0.4806743413209915,
      "reward_std": 0.052865433506667614,
      "rewards/curriculum_aware_reward_fn": 0.0431743401568383,
      "rewards/format_reward": 0.4375,
      "step": 186
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 500.5234375,
      "epoch": 3.168067226890756,
      "grad_norm": 0.4427432715892792,
      "kl": 0.01395416259765625,
      "learning_rate": 1e-06,
      "loss": -0.0198,
      "num_tokens": 23496979.0,
      "reward": 0.4243420949205756,
      "reward_std": 0.07115951599553227,
      "rewards/curriculum_aware_reward_fn": 0.11184210795909166,
      "rewards/format_reward": 0.3125,
      "step": 187
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 450.671875,
      "epoch": 3.184873949579832,
      "grad_norm": 0.4217956066131592,
      "kl": 0.0164947509765625,
      "learning_rate": 1e-06,
      "loss": 0.0361,
      "num_tokens": 23613281.0,
      "reward": 0.5629111751914024,
      "reward_std": 0.07686262531206012,
      "rewards/curriculum_aware_reward_fn": 0.12541118264198303,
      "rewards/format_reward": 0.4375,
      "step": 188
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 399.21875,
      "epoch": 3.2016806722689077,
      "grad_norm": 0.6111953258514404,
      "kl": 0.017852783203125,
      "learning_rate": 1e-06,
      "loss": 0.0248,
      "num_tokens": 23723725.0,
      "reward": 0.7121710479259491,
      "reward_std": 0.15234812535345554,
      "rewards/curriculum_aware_reward_fn": 0.08717105351388454,
      "rewards/format_reward": 0.625,
      "step": 189
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 433.015625,
      "epoch": 3.2184873949579833,
      "grad_norm": 0.4865033030509949,
      "kl": 0.0166778564453125,
      "learning_rate": 1e-06,
      "loss": -0.009,
      "num_tokens": 23835815.0,
      "reward": 0.7372532933950424,
      "reward_std": 0.13220055866986513,
      "rewards/curriculum_aware_reward_fn": 0.17475328966975212,
      "rewards/format_reward": 0.5625,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 525.4765625,
      "epoch": 3.235294117647059,
      "grad_norm": 0.3422640562057495,
      "kl": 0.016204833984375,
      "learning_rate": 1e-06,
      "loss": 0.0136,
      "num_tokens": 23964596.0,
      "reward": 0.43174342811107635,
      "reward_std": 0.09182633552700281,
      "rewards/curriculum_aware_reward_fn": 0.05674342066049576,
      "rewards/format_reward": 0.375,
      "step": 191
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 351.4453125,
      "epoch": 3.2521008403361344,
      "grad_norm": 0.5189781785011292,
      "kl": 0.023193359375,
      "learning_rate": 1e-06,
      "loss": 0.0349,
      "num_tokens": 24067141.0,
      "reward": 0.7643914222717285,
      "reward_std": 0.15736807510256767,
      "rewards/curriculum_aware_reward_fn": 0.1940789488144219,
      "rewards/format_reward": 0.5703125,
      "step": 192
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 460.296875,
      "epoch": 3.26890756302521,
      "grad_norm": 0.36804094910621643,
      "kl": 0.012298583984375,
      "learning_rate": 1e-06,
      "loss": -0.0044,
      "num_tokens": 24187067.0,
      "reward": 0.48273026943206787,
      "reward_std": 0.037970013450831175,
      "rewards/curriculum_aware_reward_fn": 0.10773026570677757,
      "rewards/format_reward": 0.375,
      "step": 193
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 418.078125,
      "epoch": 3.2857142857142856,
      "grad_norm": 0.4727684259414673,
      "kl": 0.01959228515625,
      "learning_rate": 1e-06,
      "loss": 0.0229,
      "num_tokens": 24300605.0,
      "reward": 0.5563322491943836,
      "reward_std": 0.06251880899071693,
      "rewards/curriculum_aware_reward_fn": 0.13445723743643612,
      "rewards/format_reward": 0.421875,
      "step": 194
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 498.9765625,
      "epoch": 3.302521008403361,
      "grad_norm": 0.5195404887199402,
      "kl": 0.01263427734375,
      "learning_rate": 1e-06,
      "loss": 0.0303,
      "num_tokens": 24427482.0,
      "reward": 0.38569077104330063,
      "reward_std": 0.10553359193727374,
      "rewards/curriculum_aware_reward_fn": 0.07319078966975212,
      "rewards/format_reward": 0.3125,
      "step": 195
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 448.625,
      "epoch": 3.3193277310924367,
      "grad_norm": 0.49932360649108887,
      "kl": 0.017852783203125,
      "learning_rate": 1e-06,
      "loss": 0.0397,
      "num_tokens": 24541746.0,
      "reward": 0.5193256512284279,
      "reward_std": 0.10704736225306988,
      "rewards/curriculum_aware_reward_fn": 0.08182565891183913,
      "rewards/format_reward": 0.4375,
      "step": 196
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 377.1015625,
      "epoch": 3.3361344537815127,
      "grad_norm": 0.4484708309173584,
      "kl": 0.01934814453125,
      "learning_rate": 1e-06,
      "loss": 0.0074,
      "num_tokens": 24647103.0,
      "reward": 0.6673519611358643,
      "reward_std": 0.06431722524575889,
      "rewards/curriculum_aware_reward_fn": 0.16735197603702545,
      "rewards/format_reward": 0.5,
      "step": 197
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 483.921875,
      "epoch": 3.3529411764705883,
      "grad_norm": 0.41696909070014954,
      "kl": 0.01053619384765625,
      "learning_rate": 1e-06,
      "loss": 0.0158,
      "num_tokens": 24773253.0,
      "reward": 0.2717927638441324,
      "reward_std": 0.11790546495467424,
      "rewards/curriculum_aware_reward_fn": 0.08429276384413242,
      "rewards/format_reward": 0.1875,
      "step": 198
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 380.0546875,
      "epoch": 3.369747899159664,
      "grad_norm": 0.45737817883491516,
      "kl": 0.022186279296875,
      "learning_rate": 1e-06,
      "loss": 0.0056,
      "num_tokens": 24881188.0,
      "reward": 0.7002467140555382,
      "reward_std": 0.03508220613002777,
      "rewards/curriculum_aware_reward_fn": 0.1377467131242156,
      "rewards/format_reward": 0.5625,
      "step": 199
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 390.3671875,
      "epoch": 3.3865546218487395,
      "grad_norm": 0.5029156804084778,
      "kl": 0.0277099609375,
      "learning_rate": 1e-06,
      "loss": 0.004,
      "num_tokens": 24991955.0,
      "reward": 0.6694078892469406,
      "reward_std": 0.10574874095618725,
      "rewards/curriculum_aware_reward_fn": 0.10690789762884378,
      "rewards/format_reward": 0.5625,
      "step": 200
    },
    {
      "epoch": 3.3865546218487395,
      "step": 200,
      "total_flos": 0.0,
      "train_loss": 0.010024200768093579,
      "train_runtime": 35564.3846,
      "train_samples_per_second": 0.72,
      "train_steps_per_second": 0.006
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}