{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.994871794871795,
  "eval_steps": 500,
  "global_step": 194,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.428125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 249.8,
      "completions/mean_length": 231.165625,
      "completions/mean_terminated_length": 170.26304931640624,
      "completions/min_length": 32.4,
      "completions/min_terminated_length": 32.4,
      "epoch": 0.05128205128205128,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.7885048389434814,
      "kl": 0.0009005711530335247,
      "learning_rate": 6.857142857142857e-06,
      "loss": 0.0433,
      "num_tokens": 190516.0,
      "reward": -3022.819580078125,
      "reward_std": 900.1877319335938,
      "rewards/wrapper/mean": -755.704880669713,
      "rewards/wrapper/std": 323.2347017750144,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.328125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 216.6640625,
      "completions/mean_terminated_length": 138.70114440917968,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.10256410256410256,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 17.930580139160156,
      "kl": 0.5607756731566041,
      "learning_rate": 1.5428571428571428e-05,
      "loss": 0.102,
      "num_tokens": 370555.0,
      "reward": -2350.24609375,
      "reward_std": 1036.3879638671874,
      "rewards/wrapper/mean": -587.5615524947643,
      "rewards/wrapper/std": 343.1748051315546,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.31875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 253.2,
      "completions/mean_length": 215.7828125,
      "completions/mean_terminated_length": 137.66307067871094,
      "completions/min_length": 10.6,
      "completions/min_terminated_length": 10.6,
      "epoch": 0.15384615384615385,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.9885616302490234,
      "kl": 0.1523141123354435,
      "learning_rate": 2.4e-05,
      "loss": 0.087,
      "num_tokens": 551026.0,
      "reward": -1799.3174072265624,
      "reward_std": 753.4510498046875,
      "rewards/wrapper/mean": -449.82934576869013,
      "rewards/wrapper/std": 268.9517418012023,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 253.2,
      "completions/mean_length": 192.778125,
      "completions/mean_terminated_length": 127.8021957397461,
      "completions/min_length": 4.8,
      "completions/min_terminated_length": 4.8,
      "epoch": 0.20512820512820512,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.276989936828613,
      "kl": 0.28207572996616365,
      "learning_rate": 3.2571428571428566e-05,
      "loss": 0.15,
      "num_tokens": 714888.0,
      "reward": -1380.6512451171875,
      "reward_std": 699.9352905273438,
      "rewards/wrapper/mean": -345.1628050208092,
      "rewards/wrapper/std": 243.51311295181512,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.825,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 252.0,
      "completions/mean_length": 97.496875,
      "completions/mean_terminated_length": 84.21928253173829,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2564102564102564,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 951.533935546875,
      "kl": 2.6448661953210832,
      "learning_rate": 4.1142857142857146e-05,
      "loss": 0.2845,
      "num_tokens": 820914.0,
      "reward": -583.757763671875,
      "reward_std": 387.18121032714845,
      "rewards/wrapper/mean": -145.93943485021592,
      "rewards/wrapper/std": 161.7167808920145,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.984375,
      "completions/max_length": 253.8,
      "completions/max_terminated_length": 205.2,
      "completions/mean_length": 49.8390625,
      "completions/mean_terminated_length": 48.195822143554686,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3076923076923077,
      "frac_reward_zero_std": 0.034375,
      "grad_norm": 37.3953742980957,
      "kl": 1.2820908799767494,
      "learning_rate": 4.971428571428572e-05,
      "loss": 0.097,
      "num_tokens": 895293.0,
      "reward": -340.1655334472656,
      "reward_std": 238.6476287841797,
      "rewards/wrapper/mean": -85.0413837403059,
      "rewards/wrapper/std": 130.81275693774222,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.99375,
      "completions/max_length": 226.8,
      "completions/max_terminated_length": 201.8,
      "completions/mean_length": 52.6359375,
      "completions/mean_terminated_length": 51.99286422729492,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.358974358974359,
      "frac_reward_zero_std": 0.003125,
      "grad_norm": 3.4364609718322754,
      "kl": 0.8841245293617248,
      "learning_rate": 5.8285714285714284e-05,
      "loss": 0.1376,
      "num_tokens": 969890.0,
      "reward": -330.9278930664062,
      "reward_std": 251.6781036376953,
      "rewards/wrapper/mean": -82.73197320103645,
      "rewards/wrapper/std": 117.19065331816674,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.99375,
      "completions/max_length": 185.0,
      "completions/max_terminated_length": 179.8,
      "completions/mean_length": 56.6375,
      "completions/mean_terminated_length": 56.020660400390625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.41025641025641024,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.0956766605377197,
      "kl": 0.6315083980560303,
      "learning_rate": 5.992976526832631e-05,
      "loss": 0.1093,
      "num_tokens": 1048146.0,
      "reward": -239.1210479736328,
      "reward_std": 183.56537475585938,
      "rewards/wrapper/mean": -59.78026267290115,
      "rewards/wrapper/std": 73.05939166247845,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 183.0,
      "completions/max_terminated_length": 183.0,
      "completions/mean_length": 56.0390625,
      "completions/mean_terminated_length": 56.0390625,
      "completions/min_length": 5.2,
      "completions/min_terminated_length": 5.2,
      "epoch": 0.46153846153846156,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.076198101043701,
      "kl": 2.5414538830518723,
      "learning_rate": 5.9645188003820935e-05,
      "loss": 0.1353,
      "num_tokens": 1125149.0,
      "reward": -128.92584991455078,
      "reward_std": 117.70238952636718,
      "rewards/wrapper/mean": -32.23146015405655,
      "rewards/wrapper/std": 53.83595065623522,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 126.8,
      "completions/max_terminated_length": 126.8,
      "completions/mean_length": 47.9640625,
      "completions/mean_terminated_length": 47.9640625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.5128205128205128,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.3035640716552734,
      "kl": 0.8232784837484359,
      "learning_rate": 5.914465158126556e-05,
      "loss": 0.0672,
      "num_tokens": 1197488.0,
      "reward": -63.173458862304685,
      "reward_std": 89.90511016845703,
      "rewards/wrapper/mean": -15.793364781141282,
      "rewards/wrapper/std": 38.18243933916092,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 89.4,
      "completions/max_terminated_length": 89.4,
      "completions/mean_length": 42.209375,
      "completions/mean_terminated_length": 42.209375,
      "completions/min_length": 14.2,
      "completions/min_terminated_length": 14.2,
      "epoch": 0.5641025641025641,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.7276079654693604,
      "kl": 0.9960113108158112,
      "learning_rate": 5.843303721568397e-05,
      "loss": 0.0683,
      "num_tokens": 1264850.0,
      "reward": -118.52191162109375,
      "reward_std": 96.5303970336914,
      "rewards/wrapper/mean": -29.6304764598608,
      "rewards/wrapper/std": 43.042814162373546,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 84.6,
      "completions/max_terminated_length": 84.6,
      "completions/mean_length": 34.63125,
      "completions/mean_terminated_length": 34.63125,
      "completions/min_length": 6.8,
      "completions/min_terminated_length": 6.8,
      "epoch": 0.6153846153846154,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.0093631744384766,
      "kl": 0.8994639366865158,
      "learning_rate": 5.751728454738182e-05,
      "loss": 0.0518,
      "num_tokens": 1328540.0,
      "reward": -66.9901252746582,
      "reward_std": 90.42131805419922,
      "rewards/wrapper/mean": -16.747532141208648,
      "rewards/wrapper/std": 36.916642357409,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 128.6,
      "completions/max_terminated_length": 128.6,
      "completions/mean_length": 35.8828125,
      "completions/mean_terminated_length": 35.8828125,
      "completions/min_length": 14.4,
      "completions/min_terminated_length": 14.4,
      "epoch": 0.6666666666666666,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.119898796081543,
      "kl": 0.9469845354557037,
      "learning_rate": 5.640632396679849e-05,
      "loss": 0.0846,
      "num_tokens": 1393415.0,
      "reward": -31.619766616821288,
      "reward_std": 77.99259414672852,
      "rewards/wrapper/mean": -7.904942592978477,
      "rewards/wrapper/std": 37.58498305678368,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 100.4,
      "completions/max_terminated_length": 100.4,
      "completions/mean_length": 36.678125,
      "completions/mean_terminated_length": 36.678125,
      "completions/min_length": 14.2,
      "completions/min_terminated_length": 14.2,
      "epoch": 0.717948717948718,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.7695233821868896,
      "kl": 1.2247124403715133,
      "learning_rate": 5.511098952562815e-05,
      "loss": 0.0823,
      "num_tokens": 1459001.0,
      "reward": -4.833992385864258,
      "reward_std": 56.544490814208984,
      "rewards/wrapper/mean": -1.2084980607032776,
      "rewards/wrapper/std": 27.944597291946412,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 97.4,
      "completions/max_terminated_length": 97.4,
      "completions/mean_length": 43.2375,
      "completions/mean_terminated_length": 43.2375,
      "completions/min_length": 13.2,
      "completions/min_terminated_length": 13.2,
      "epoch": 0.7692307692307693,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.9764764308929443,
      "kl": 1.2457805454730988,
      "learning_rate": 5.364391328349745e-05,
      "loss": 0.0756,
      "num_tokens": 1528715.0,
      "reward": -13.390717124938964,
      "reward_std": 69.7399917602539,
      "rewards/wrapper/mean": -3.347679337859154,
      "rewards/wrapper/std": 32.90785598009825,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 78.8,
      "completions/max_terminated_length": 78.8,
      "completions/mean_length": 36.440625,
      "completions/mean_terminated_length": 36.440625,
      "completions/min_length": 14.4,
      "completions/min_terminated_length": 14.4,
      "epoch": 0.8205128205128205,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.8895949125289917,
      "kl": 1.0609969675540925,
      "learning_rate": 5.201940212052822e-05,
      "loss": 0.0591,
      "num_tokens": 1594177.0,
      "reward": 4.946323013305664,
      "reward_std": 54.40032806396484,
      "rewards/wrapper/mean": 1.2365794837474824,
      "rewards/wrapper/std": 28.65713137835264,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 84.8,
      "completions/max_terminated_length": 84.8,
      "completions/mean_length": 38.1,
      "completions/mean_terminated_length": 38.1,
      "completions/min_length": 19.6,
      "completions/min_terminated_length": 19.6,
      "epoch": 0.8717948717948718,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.7108911275863647,
      "kl": 1.11464681327343,
      "learning_rate": 5.0253298217104604e-05,
      "loss": 0.0895,
      "num_tokens": 1659627.0,
      "reward": 7.2968450546264645,
      "reward_std": 53.64143524169922,
      "rewards/wrapper/mean": 1.8242116868495941,
      "rewards/wrapper/std": 28.17090885937214,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 65.4,
      "completions/max_terminated_length": 65.4,
      "completions/mean_length": 33.125,
      "completions/mean_terminated_length": 33.125,
      "completions/min_length": 16.8,
      "completions/min_terminated_length": 16.8,
      "epoch": 0.9230769230769231,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.109586238861084,
      "kl": 1.2486278891563416,
      "learning_rate": 4.8362824561441765e-05,
      "loss": 0.0599,
      "num_tokens": 1720977.0,
      "reward": 20.833633995056154,
      "reward_std": 44.79316711425781,
      "rewards/wrapper/mean": 5.2084078043699265,
      "rewards/wrapper/std": 24.906518502533437,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 83.8,
      "completions/max_terminated_length": 83.8,
      "completions/mean_length": 44.4625,
      "completions/mean_terminated_length": 44.4625,
      "completions/min_length": 19.8,
      "completions/min_terminated_length": 19.8,
      "epoch": 0.9743589743589743,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.6314218044281006,
      "kl": 1.2025073766708374,
      "learning_rate": 4.636641699156113e-05,
      "loss": 0.0708,
      "num_tokens": 1791235.0,
      "reward": -30.517698287963867,
      "reward_std": 65.50241241455078,
      "rewards/wrapper/mean": -7.629425781965256,
      "rewards/wrapper/std": 36.861526291072366,
      "step": 95
    },
    {
      "epoch": 0.9948717948717949,
      "eval_clip_ratio/high_max": 0.0,
      "eval_clip_ratio/high_mean": 0.0,
      "eval_clip_ratio/low_mean": 0.0,
      "eval_clip_ratio/low_min": 0.0,
      "eval_clip_ratio/region_mean": 0.0,
      "eval_completions/clipped_ratio": -1.0,
      "eval_completions/max_length": 92.12,
      "eval_completions/max_terminated_length": 92.12,
      "eval_completions/mean_length": 63.9175,
      "eval_completions/mean_terminated_length": 63.9175,
      "eval_completions/min_length": 38.64,
      "eval_completions/min_terminated_length": 38.64,
      "eval_frac_reward_zero_std": 0.0,
      "eval_kl": 1.1354684853553771,
      "eval_loss": 0.10735327005386353,
      "eval_num_tokens": 1819685.0,
      "eval_reward": -66.02375819206237,
      "eval_reward_std": 83.30394130706787,
      "eval_rewards/wrapper/mean": -16.50594002187252,
      "eval_rewards/wrapper/std": 38.53076554499567,
      "eval_runtime": 43.7098,
      "eval_samples_per_second": 4.576,
      "eval_steps_per_second": 0.572,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 115.6,
      "completions/max_terminated_length": 115.6,
      "completions/mean_length": 59.56875,
      "completions/mean_terminated_length": 59.56875,
      "completions/min_length": 15.4,
      "completions/min_terminated_length": 15.4,
      "epoch": 1.0307692307692307,
      "frac_reward_zero_std": 0.003125,
      "grad_norm": 1.237181305885315,
      "kl": 1.4063826143741607,
      "learning_rate": 4.4283544409593574e-05,
      "loss": 0.101,
      "num_tokens": 1871031.0,
      "reward": -60.77660579681397,
      "reward_std": 79.36799545288086,
      "rewards/wrapper/mean": -15.194152063131332,
      "rewards/wrapper/std": 36.98495513498783,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 93.4,
      "completions/max_terminated_length": 93.4,
      "completions/mean_length": 46.6171875,
      "completions/mean_terminated_length": 46.6171875,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 1.082051282051282,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.296970009803772,
      "kl": 1.066127210855484,
      "learning_rate": 4.2134518921674714e-05,
      "loss": 0.0539,
      "num_tokens": 1942044.0,
      "reward": -22.646591567993163,
      "reward_std": 68.25165405273438,
      "rewards/wrapper/mean": -5.661647778749466,
      "rewards/wrapper/std": 28.533076685667037,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 77.6,
      "completions/max_terminated_length": 77.6,
      "completions/mean_length": 42.459375,
      "completions/mean_terminated_length": 42.459375,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 1.1333333333333333,
      "frac_reward_zero_std": 0.003125,
      "grad_norm": 1.8483984470367432,
      "kl": 1.0357002288103103,
      "learning_rate": 3.9940297754942075e-05,
      "loss": 0.0516,
      "num_tokens": 2010596.0,
      "reward": 0.9197359561920166,
      "reward_std": 55.6744888305664,
      "rewards/wrapper/mean": 0.22993419468402862,
      "rewards/wrapper/std": 25.568822310864924,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 113.0,
      "completions/max_terminated_length": 113.0,
      "completions/mean_length": 54.165625,
      "completions/mean_terminated_length": 54.165625,
      "completions/min_length": 18.8,
      "completions/min_terminated_length": 18.8,
      "epoch": 1.1846153846153846,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.6020848751068115,
      "kl": 1.0221260011196136,
      "learning_rate": 3.7722278883332974e-05,
      "loss": 0.1395,
      "num_tokens": 2087050.0,
      "reward": -30.523137283325195,
      "reward_std": 77.01765441894531,
      "rewards/wrapper/mean": -7.630784884095192,
      "rewards/wrapper/std": 36.765226520597935,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 82.0,
      "completions/max_terminated_length": 82.0,
      "completions/mean_length": 38.0828125,
      "completions/mean_terminated_length": 38.0828125,
      "completions/min_length": 16.6,
      "completions/min_terminated_length": 16.6,
      "epoch": 1.235897435897436,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.0064072608947754,
      "kl": 1.0408909171819687,
      "learning_rate": 3.550209235523397e-05,
      "loss": 0.0513,
      "num_tokens": 2152491.0,
      "reward": 22.795025253295897,
      "reward_std": 41.94969177246094,
      "rewards/wrapper/mean": 5.698757213354111,
      "rewards/wrapper/std": 19.809213139116764,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 107.8,
      "completions/max_terminated_length": 107.8,
      "completions/mean_length": 46.9953125,
      "completions/mean_terminated_length": 46.9953125,
      "completions/min_length": 22.4,
      "completions/min_terminated_length": 22.4,
      "epoch": 1.287179487179487,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.0431101322174072,
      "kl": 1.2138669222593308,
      "learning_rate": 3.330138935794765e-05,
      "loss": 0.1058,
      "num_tokens": 2224424.0,
      "reward": -19.105009269714355,
      "reward_std": 62.34371109008789,
      "rewards/wrapper/mean": -4.776251962780952,
      "rewards/wrapper/std": 31.607597357034685,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 66.0,
      "completions/max_terminated_length": 66.0,
      "completions/mean_length": 34.2125,
      "completions/mean_terminated_length": 34.2125,
      "completions/min_length": 10.8,
      "completions/min_terminated_length": 10.8,
      "epoch": 1.3384615384615386,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.2422068119049072,
      "kl": 1.209676158428192,
      "learning_rate": 3.114163107601353e-05,
      "loss": 0.0636,
      "num_tokens": 2287262.0,
      "reward": 32.78384780883789,
      "reward_std": 38.527008056640625,
      "rewards/wrapper/mean": 8.195961704850196,
      "rewards/wrapper/std": 19.960034711658956,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 79.6,
      "completions/max_terminated_length": 79.6,
      "completions/mean_length": 34.575,
      "completions/mean_terminated_length": 34.575,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 1.3897435897435897,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.8982096910476685,
      "kl": 1.0722932010889052,
      "learning_rate": 2.9043879402429647e-05,
      "loss": 0.0965,
      "num_tokens": 2350846.0,
      "reward": 30.75788803100586,
      "reward_std": 43.10101852416992,
      "rewards/wrapper/mean": 7.689471507072449,
      "rewards/wrapper/std": 21.09446730464697,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 72.4,
      "completions/max_terminated_length": 72.4,
      "completions/mean_length": 34.4046875,
      "completions/mean_terminated_length": 34.4046875,
      "completions/min_length": 19.2,
      "completions/min_terminated_length": 19.2,
      "epoch": 1.441025641025641,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.007930278778076,
      "kl": 1.1486232221126556,
      "learning_rate": 2.7028591543752427e-05,
      "loss": 0.0813,
      "num_tokens": 2415573.0,
      "reward": 35.33515167236328,
      "reward_std": 37.335984802246095,
      "rewards/wrapper/mean": 8.833788806200028,
      "rewards/wrapper/std": 20.25660429894924,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 61.2,
      "completions/max_terminated_length": 61.2,
      "completions/mean_length": 29.40625,
      "completions/mean_terminated_length": 29.40625,
      "completions/min_length": 14.6,
      "completions/min_terminated_length": 14.6,
      "epoch": 1.4923076923076923,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.8863755464553833,
      "kl": 1.1906954646110535,
      "learning_rate": 2.511542052207919e-05,
      "loss": 0.0648,
      "num_tokens": 2477523.0,
      "reward": 38.670701599121095,
      "reward_std": 33.14458961486817,
      "rewards/wrapper/mean": 9.667675691843034,
      "rewards/wrapper/std": 15.559051163494587,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 80.8,
      "completions/max_terminated_length": 80.8,
      "completions/mean_length": 35.03125,
      "completions/mean_terminated_length": 35.03125,
      "completions/min_length": 18.6,
      "completions/min_terminated_length": 18.6,
      "epoch": 1.5435897435897434,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.7253738641738892,
      "kl": 1.1878035724163056,
      "learning_rate": 2.3323023519411503e-05,
      "loss": 0.1154,
      "num_tokens": 2541265.0,
      "reward": 27.669879531860353,
      "reward_std": 43.78490676879883,
      "rewards/wrapper/mean": 6.917469450831414,
      "rewards/wrapper/std": 23.019567246735097,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 72.4,
      "completions/max_terminated_length": 72.4,
      "completions/mean_length": 31.628125,
      "completions/mean_terminated_length": 31.628125,
      "completions/min_length": 19.6,
      "completions/min_terminated_length": 19.6,
      "epoch": 1.594871794871795,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.564350605010986,
      "kl": 1.3340431094169616,
      "learning_rate": 2.1668879933418993e-05,
      "loss": 0.1029,
      "num_tokens": 2602789.0,
      "reward": 39.08445358276367,
      "reward_std": 42.32896957397461,
      "rewards/wrapper/mean": 9.771112731099128,
      "rewards/wrapper/std": 20.528056579828263,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 76.8,
      "completions/max_terminated_length": 76.8,
      "completions/mean_length": 29.2515625,
      "completions/mean_terminated_length": 29.2515625,
      "completions/min_length": 18.6,
      "completions/min_terminated_length": 18.6,
      "epoch": 1.646153846153846,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.9051811695098877,
      "kl": 1.2881734311580657,
      "learning_rate": 2.0169120918917907e-05,
      "loss": 0.0809,
      "num_tokens": 2663192.0,
      "reward": 52.69954376220703,
      "reward_std": 32.72202682495117,
      "rewards/wrapper/mean": 13.174886631965638,
      "rewards/wrapper/std": 16.570664164423942,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 70.8,
      "completions/max_terminated_length": 70.8,
      "completions/mean_length": 31.2953125,
      "completions/mean_terminated_length": 31.2953125,
      "completions/min_length": 20.8,
      "completions/min_terminated_length": 20.8,
      "epoch": 1.6974358974358974,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.7347543239593506,
      "kl": 1.1357142448425293,
      "learning_rate": 1.8838372077370462e-05,
      "loss": 0.0823,
      "num_tokens": 2724121.0,
      "reward": 37.44524154663086,
      "reward_std": 37.703224182128906,
      "rewards/wrapper/mean": 9.361309441924096,
      "rewards/wrapper/std": 20.84148458391428,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 54.4,
      "completions/max_terminated_length": 54.4,
      "completions/mean_length": 28.6734375,
      "completions/mean_terminated_length": 28.6734375,
      "completions/min_length": 18.2,
      "completions/min_terminated_length": 18.2,
      "epoch": 1.7487179487179487,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.6308436393737793,
      "kl": 1.1717410743236543,
      "learning_rate": 1.7689610828491808e-05,
      "loss": 0.0589,
      "num_tokens": 2784650.0,
      "reward": 58.644406127929685,
      "reward_std": 29.47605781555176,
      "rewards/wrapper/mean": 14.661102002859115,
      "rewards/wrapper/std": 14.715145578980446,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 65.8,
      "completions/max_terminated_length": 65.8,
      "completions/mean_length": 29.45625,
      "completions/mean_terminated_length": 29.45625,
      "completions/min_length": 17.6,
      "completions/min_terminated_length": 17.6,
      "epoch": 1.8,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.117802858352661,
      "kl": 1.2103615283966065,
      "learning_rate": 1.67340398548722e-05,
      "loss": 0.0726,
      "num_tokens": 2844024.0,
      "reward": 59.70240249633789,
      "reward_std": 27.98085708618164,
      "rewards/wrapper/mean": 14.925600388646126,
      "rewards/wrapper/std": 15.45614178404212,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 60.0,
      "completions/max_terminated_length": 60.0,
      "completions/mean_length": 29.1984375,
      "completions/mean_terminated_length": 29.1984375,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 1.8512820512820514,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.60492742061615,
      "kl": 1.2078358113765717,
      "learning_rate": 1.5980977853778245e-05,
      "loss": 0.0608,
      "num_tokens": 2903601.0,
      "reward": 63.7161376953125,
      "reward_std": 28.505185317993163,
      "rewards/wrapper/mean": 15.929034775495529,
      "rewards/wrapper/std": 14.41244371831417,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 55.0,
      "completions/max_terminated_length": 55.0,
      "completions/mean_length": 30.325,
      "completions/mean_terminated_length": 30.325,
      "completions/min_length": 20.6,
      "completions/min_terminated_length": 20.6,
      "epoch": 1.9025641025641025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.4518439769744873,
      "kl": 1.1801301002502442,
      "learning_rate": 1.5437768661518216e-05,
      "loss": 0.0657,
      "num_tokens": 2964755.0,
      "reward": 58.35861740112305,
      "reward_std": 29.095483779907227,
      "rewards/wrapper/mean": 14.589654579758644,
      "rewards/wrapper/std": 15.775352307409047,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 52.8,
      "completions/max_terminated_length": 52.8,
      "completions/mean_length": 29.8828125,
      "completions/mean_terminated_length": 29.8828125,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 1.953846153846154,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.6311966180801392,
      "kl": 1.179757869243622,
      "learning_rate": 1.5109709636587557e-05,
      "loss": 0.0579,
      "num_tokens": 3025992.0,
      "reward": 66.08211746215821,
      "reward_std": 29.545513153076172,
      "rewards/wrapper/mean": 16.520529848337173,
      "rewards/wrapper/std": 14.267796693742275,
      "step": 190
    },
    {
      "epoch": 1.994871794871795,
      "eval_clip_ratio/high_max": 0.0,
      "eval_clip_ratio/high_mean": 0.0,
      "eval_clip_ratio/low_mean": 0.0,
      "eval_clip_ratio/low_min": 0.0,
      "eval_clip_ratio/region_mean": 0.0,
      "eval_completions/clipped_ratio": -1.0,
      "eval_completions/max_length": 39.62,
      "eval_completions/max_terminated_length": 39.62,
      "eval_completions/mean_length": 29.4125,
      "eval_completions/mean_terminated_length": 29.4125,
      "eval_completions/min_length": 22.36,
      "eval_completions/min_terminated_length": 22.36,
      "eval_frac_reward_zero_std": 0.005,
      "eval_kl": 1.3025571525096893,
      "eval_loss": 0.06812583655118942,
      "eval_num_tokens": 3075371.0,
      "eval_reward": 71.65954360961913,
      "eval_reward_std": 28.824577026367187,
      "eval_rewards/wrapper/mean": 17.914886049479247,
      "eval_rewards/wrapper/std": 13.78032234594226,
      "eval_runtime": 23.7513,
      "eval_samples_per_second": 8.421,
      "eval_steps_per_second": 1.053,
      "step": 194
    }
  ],
  "logging_steps": 5,
  "max_steps": 194,
  "num_input_tokens_seen": 3075371,
  "num_train_epochs": 2,
  "save_steps": 200.0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}