{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.994871794871795, "eval_steps": 500, "global_step": 194, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.428125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.8, "completions/mean_length": 231.165625, "completions/mean_terminated_length": 170.26304931640624, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.05128205128205128, "frac_reward_zero_std": 0.0, "grad_norm": 1.7885048389434814, "kl": 0.0009005711530335247, "learning_rate": 6.857142857142857e-06, "loss": 0.0433, "num_tokens": 190516.0, "reward": -3022.819580078125, "reward_std": 900.1877319335938, "rewards/wrapper/mean": -755.704880669713, "rewards/wrapper/std": 323.2347017750144, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 216.6640625, "completions/mean_terminated_length": 138.70114440917968, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.10256410256410256, "frac_reward_zero_std": 0.0, "grad_norm": 17.930580139160156, "kl": 0.5607756731566041, "learning_rate": 1.5428571428571428e-05, "loss": 0.102, "num_tokens": 370555.0, "reward": -2350.24609375, "reward_std": 1036.3879638671874, "rewards/wrapper/mean": -587.5615524947643, "rewards/wrapper/std": 343.1748051315546, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.31875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.2, "completions/mean_length": 215.7828125, "completions/mean_terminated_length": 137.66307067871094, "completions/min_length": 10.6, "completions/min_terminated_length": 10.6, "epoch": 0.15384615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 1.9885616302490234, "kl": 0.1523141123354435, "learning_rate": 2.4e-05, "loss": 0.087, "num_tokens": 551026.0, "reward": -1799.3174072265624, "reward_std": 753.4510498046875, "rewards/wrapper/mean": -449.82934576869013, "rewards/wrapper/std": 268.9517418012023, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.2, "completions/mean_length": 192.778125, "completions/mean_terminated_length": 127.8021957397461, "completions/min_length": 4.8, "completions/min_terminated_length": 4.8, "epoch": 0.20512820512820512, "frac_reward_zero_std": 0.0, "grad_norm": 10.276989936828613, "kl": 0.28207572996616365, "learning_rate": 3.2571428571428566e-05, "loss": 0.15, "num_tokens": 714888.0, "reward": -1380.6512451171875, "reward_std": 699.9352905273438, "rewards/wrapper/mean": -345.1628050208092, "rewards/wrapper/std": 243.51311295181512, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.825, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 97.496875, "completions/mean_terminated_length": 84.21928253173829, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.2564102564102564, "frac_reward_zero_std": 0.025, "grad_norm": 951.533935546875, "kl": 2.6448661953210832, "learning_rate": 4.1142857142857146e-05, "loss": 0.2845, "num_tokens": 820914.0, "reward": -583.757763671875, "reward_std": 387.18121032714845, "rewards/wrapper/mean": -145.93943485021592, "rewards/wrapper/std": 161.7167808920145, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "completions/max_length": 253.8, "completions/max_terminated_length": 205.2, "completions/mean_length": 49.8390625, "completions/mean_terminated_length": 48.195822143554686, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.3076923076923077, "frac_reward_zero_std": 0.034375, "grad_norm": 37.3953742980957, "kl": 1.2820908799767494, "learning_rate": 4.971428571428572e-05, "loss": 0.097, "num_tokens": 895293.0, "reward": -340.1655334472656, "reward_std": 238.6476287841797, "rewards/wrapper/mean": -85.0413837403059, "rewards/wrapper/std": 130.81275693774222, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.99375, "completions/max_length": 226.8, "completions/max_terminated_length": 201.8, "completions/mean_length": 52.6359375, "completions/mean_terminated_length": 51.99286422729492, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.358974358974359, "frac_reward_zero_std": 0.003125, "grad_norm": 3.4364609718322754, "kl": 0.8841245293617248, "learning_rate": 5.8285714285714284e-05, "loss": 0.1376, "num_tokens": 969890.0, "reward": -330.9278930664062, "reward_std": 251.6781036376953, "rewards/wrapper/mean": -82.73197320103645, "rewards/wrapper/std": 117.19065331816674, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.99375, "completions/max_length": 185.0, "completions/max_terminated_length": 179.8, "completions/mean_length": 56.6375, "completions/mean_terminated_length": 56.020660400390625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.41025641025641024, "frac_reward_zero_std": 0.0, "grad_norm": 2.0956766605377197, "kl": 0.6315083980560303, "learning_rate": 5.992976526832631e-05, "loss": 0.1093, "num_tokens": 1048146.0, "reward": -239.1210479736328, "reward_std": 183.56537475585938, "rewards/wrapper/mean": -59.78026267290115, "rewards/wrapper/std": 73.05939166247845, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 56.0390625, "completions/mean_terminated_length": 56.0390625, "completions/min_length": 5.2, "completions/min_terminated_length": 5.2, "epoch": 0.46153846153846156, "frac_reward_zero_std": 0.0, "grad_norm": 2.076198101043701, "kl": 2.5414538830518723, "learning_rate": 5.9645188003820935e-05, "loss": 0.1353, "num_tokens": 1125149.0, "reward": -128.92584991455078, "reward_std": 117.70238952636718, "rewards/wrapper/mean": -32.23146015405655, "rewards/wrapper/std": 53.83595065623522, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 126.8, "completions/max_terminated_length": 126.8, "completions/mean_length": 47.9640625, "completions/mean_terminated_length": 47.9640625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.5128205128205128, "frac_reward_zero_std": 0.0, "grad_norm": 2.3035640716552734, "kl": 0.8232784837484359, "learning_rate": 5.914465158126556e-05, "loss": 0.0672, "num_tokens": 1197488.0, "reward": -63.173458862304685, "reward_std": 89.90511016845703, "rewards/wrapper/mean": -15.793364781141282, "rewards/wrapper/std": 38.18243933916092, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 89.4, "completions/max_terminated_length": 89.4, "completions/mean_length": 42.209375, "completions/mean_terminated_length": 42.209375, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.5641025641025641, "frac_reward_zero_std": 0.0, "grad_norm": 1.7276079654693604, "kl": 0.9960113108158112, "learning_rate": 5.843303721568397e-05, "loss": 0.0683, "num_tokens": 1264850.0, "reward": -118.52191162109375, "reward_std": 96.5303970336914, "rewards/wrapper/mean": -29.6304764598608, "rewards/wrapper/std": 43.042814162373546, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 84.6, "completions/max_terminated_length": 84.6, "completions/mean_length": 34.63125, "completions/mean_terminated_length": 34.63125, "completions/min_length": 6.8, "completions/min_terminated_length": 6.8, "epoch": 0.6153846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 2.0093631744384766, "kl": 0.8994639366865158, "learning_rate": 5.751728454738182e-05, "loss": 0.0518, "num_tokens": 1328540.0, "reward": -66.9901252746582, "reward_std": 90.42131805419922, "rewards/wrapper/mean": -16.747532141208648, "rewards/wrapper/std": 36.916642357409, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 128.6, "completions/max_terminated_length": 128.6, "completions/mean_length": 35.8828125, "completions/mean_terminated_length": 35.8828125, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.119898796081543, "kl": 0.9469845354557037, "learning_rate": 5.640632396679849e-05, "loss": 0.0846, "num_tokens": 1393415.0, "reward": -31.619766616821288, "reward_std": 77.99259414672852, "rewards/wrapper/mean": -7.904942592978477, "rewards/wrapper/std": 37.58498305678368, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 100.4, "completions/max_terminated_length": 100.4, "completions/mean_length": 36.678125, "completions/mean_terminated_length": 36.678125, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.717948717948718, "frac_reward_zero_std": 0.0, "grad_norm": 2.7695233821868896, "kl": 1.2247124403715133, "learning_rate": 5.511098952562815e-05, "loss": 0.0823, "num_tokens": 1459001.0, "reward": -4.833992385864258, "reward_std": 56.544490814208984, "rewards/wrapper/mean": -1.2084980607032776, "rewards/wrapper/std": 27.944597291946412, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 97.4, "completions/max_terminated_length": 97.4, "completions/mean_length": 43.2375, "completions/mean_terminated_length": 43.2375, "completions/min_length": 13.2, "completions/min_terminated_length": 13.2, "epoch": 0.7692307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 3.9764764308929443, "kl": 1.2457805454730988, "learning_rate": 5.364391328349745e-05, "loss": 0.0756, "num_tokens": 1528715.0, "reward": -13.390717124938964, "reward_std": 69.7399917602539, "rewards/wrapper/mean": -3.347679337859154, "rewards/wrapper/std": 32.90785598009825, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 78.8, "completions/max_terminated_length": 78.8, "completions/mean_length": 36.440625, "completions/mean_terminated_length": 36.440625, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.8205128205128205, "frac_reward_zero_std": 0.0, "grad_norm": 1.8895949125289917, "kl": 1.0609969675540925, "learning_rate": 5.201940212052822e-05, "loss": 0.0591, "num_tokens": 1594177.0, "reward": 4.946323013305664, "reward_std": 54.40032806396484, "rewards/wrapper/mean": 1.2365794837474824, "rewards/wrapper/std": 28.65713137835264, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 84.8, "completions/max_terminated_length": 84.8, "completions/mean_length": 38.1, "completions/mean_terminated_length": 38.1, "completions/min_length": 19.6, "completions/min_terminated_length": 19.6, "epoch": 0.8717948717948718, "frac_reward_zero_std": 0.0, "grad_norm": 1.7108911275863647, "kl": 1.11464681327343, "learning_rate": 5.0253298217104604e-05, "loss": 0.0895, "num_tokens": 1659627.0, "reward": 7.2968450546264645, "reward_std": 53.64143524169922, "rewards/wrapper/mean": 1.8242116868495941, "rewards/wrapper/std": 28.17090885937214, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 65.4, "completions/max_terminated_length": 65.4, "completions/mean_length": 33.125, "completions/mean_terminated_length": 33.125, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.9230769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 2.109586238861084, "kl": 1.2486278891563416, "learning_rate": 4.8362824561441765e-05, "loss": 0.0599, "num_tokens": 1720977.0, "reward": 20.833633995056154, "reward_std": 44.79316711425781, "rewards/wrapper/mean": 5.2084078043699265, "rewards/wrapper/std": 24.906518502533437, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 83.8, "completions/max_terminated_length": 83.8, "completions/mean_length": 44.4625, "completions/mean_terminated_length": 44.4625, "completions/min_length": 19.8, "completions/min_terminated_length": 19.8, "epoch": 0.9743589743589743, "frac_reward_zero_std": 0.0, "grad_norm": 1.6314218044281006, "kl": 1.2025073766708374, "learning_rate": 4.636641699156113e-05, "loss": 0.0708, "num_tokens": 1791235.0, "reward": -30.517698287963867, "reward_std": 65.50241241455078, "rewards/wrapper/mean": -7.629425781965256, "rewards/wrapper/std": 36.861526291072366, "step": 95 }, { "epoch": 0.9948717948717949, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": -1.0, "eval_completions/max_length": 92.12, "eval_completions/max_terminated_length": 92.12, "eval_completions/mean_length": 63.9175, "eval_completions/mean_terminated_length": 63.9175, "eval_completions/min_length": 38.64, "eval_completions/min_terminated_length": 38.64, "eval_frac_reward_zero_std": 0.0, "eval_kl": 1.1354684853553771, "eval_loss": 0.10735327005386353, "eval_num_tokens": 1819685.0, "eval_reward": -66.02375819206237, "eval_reward_std": 83.30394130706787, "eval_rewards/wrapper/mean": -16.50594002187252, "eval_rewards/wrapper/std": 38.53076554499567, "eval_runtime": 43.7098, "eval_samples_per_second": 4.576, "eval_steps_per_second": 0.572, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 115.6, "completions/max_terminated_length": 115.6, "completions/mean_length": 59.56875, "completions/mean_terminated_length": 59.56875, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 1.0307692307692307, "frac_reward_zero_std": 0.003125, "grad_norm": 1.237181305885315, "kl": 1.4063826143741607, "learning_rate": 4.4283544409593574e-05, "loss": 0.101, "num_tokens": 1871031.0, "reward": -60.77660579681397, "reward_std": 79.36799545288086, "rewards/wrapper/mean": -15.194152063131332, "rewards/wrapper/std": 36.98495513498783, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 93.4, "completions/max_terminated_length": 93.4, "completions/mean_length": 46.6171875, "completions/mean_terminated_length": 46.6171875, "completions/min_length": 8.6, "completions/min_terminated_length": 8.6, "epoch": 1.082051282051282, "frac_reward_zero_std": 0.0, "grad_norm": 1.296970009803772, "kl": 1.066127210855484, "learning_rate": 4.2134518921674714e-05, "loss": 0.0539, "num_tokens": 1942044.0, "reward": -22.646591567993163, "reward_std": 68.25165405273438, "rewards/wrapper/mean": -5.661647778749466, "rewards/wrapper/std": 28.533076685667037, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 77.6, "completions/max_terminated_length": 77.6, "completions/mean_length": 42.459375, "completions/mean_terminated_length": 42.459375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 1.1333333333333333, "frac_reward_zero_std": 0.003125, "grad_norm": 1.8483984470367432, "kl": 1.0357002288103103, "learning_rate": 3.9940297754942075e-05, "loss": 0.0516, "num_tokens": 2010596.0, "reward": 0.9197359561920166, "reward_std": 55.6744888305664, "rewards/wrapper/mean": 0.22993419468402862, "rewards/wrapper/std": 25.568822310864924, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 54.165625, "completions/mean_terminated_length": 54.165625, "completions/min_length": 18.8, "completions/min_terminated_length": 18.8, "epoch": 1.1846153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 1.6020848751068115, "kl": 1.0221260011196136, "learning_rate": 3.7722278883332974e-05, "loss": 0.1395, "num_tokens": 2087050.0, "reward": -30.523137283325195, "reward_std": 77.01765441894531, "rewards/wrapper/mean": -7.630784884095192, "rewards/wrapper/std": 36.765226520597935, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 38.0828125, "completions/mean_terminated_length": 38.0828125, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 1.235897435897436, "frac_reward_zero_std": 0.0, "grad_norm": 2.0064072608947754, "kl": 1.0408909171819687, "learning_rate": 3.550209235523397e-05, "loss": 0.0513, "num_tokens": 2152491.0, "reward": 22.795025253295897, "reward_std": 41.94969177246094, "rewards/wrapper/mean": 5.698757213354111, "rewards/wrapper/std": 19.809213139116764, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 107.8, "completions/max_terminated_length": 107.8, "completions/mean_length": 46.9953125, "completions/mean_terminated_length": 46.9953125, "completions/min_length": 22.4, "completions/min_terminated_length": 22.4, "epoch": 1.287179487179487, "frac_reward_zero_std": 0.0, "grad_norm": 3.0431101322174072, "kl": 1.2138669222593308, "learning_rate": 3.330138935794765e-05, "loss": 0.1058, "num_tokens": 2224424.0, "reward": -19.105009269714355, "reward_std": 62.34371109008789, "rewards/wrapper/mean": -4.776251962780952, "rewards/wrapper/std": 31.607597357034685, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 34.2125, "completions/mean_terminated_length": 34.2125, "completions/min_length": 10.8, "completions/min_terminated_length": 10.8, "epoch": 1.3384615384615386, "frac_reward_zero_std": 0.0, "grad_norm": 2.2422068119049072, "kl": 1.209676158428192, "learning_rate": 3.114163107601353e-05, "loss": 0.0636, "num_tokens": 2287262.0, "reward": 32.78384780883789, "reward_std": 38.527008056640625, "rewards/wrapper/mean": 8.195961704850196, "rewards/wrapper/std": 19.960034711658956, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 79.6, "completions/max_terminated_length": 79.6, "completions/mean_length": 34.575, "completions/mean_terminated_length": 34.575, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.3897435897435897, "frac_reward_zero_std": 0.0, "grad_norm": 1.8982096910476685, "kl": 1.0722932010889052, "learning_rate": 2.9043879402429647e-05, "loss": 0.0965, "num_tokens": 2350846.0, "reward": 30.75788803100586, "reward_std": 43.10101852416992, "rewards/wrapper/mean": 7.689471507072449, "rewards/wrapper/std": 21.09446730464697, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 72.4, "completions/max_terminated_length": 72.4, "completions/mean_length": 34.4046875, "completions/mean_terminated_length": 34.4046875, "completions/min_length": 19.2, "completions/min_terminated_length": 19.2, "epoch": 1.441025641025641, "frac_reward_zero_std": 0.0, "grad_norm": 2.007930278778076, "kl": 1.1486232221126556, "learning_rate": 2.7028591543752427e-05, "loss": 0.0813, "num_tokens": 2415573.0, "reward": 35.33515167236328, "reward_std": 37.335984802246095, "rewards/wrapper/mean": 8.833788806200028, "rewards/wrapper/std": 20.25660429894924, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 61.2, "completions/max_terminated_length": 61.2, "completions/mean_length": 29.40625, "completions/mean_terminated_length": 29.40625, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 1.4923076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 1.8863755464553833, "kl": 1.1906954646110535, "learning_rate": 2.511542052207919e-05, "loss": 0.0648, "num_tokens": 2477523.0, "reward": 38.670701599121095, "reward_std": 33.14458961486817, "rewards/wrapper/mean": 9.667675691843034, "rewards/wrapper/std": 15.559051163494587, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 80.8, "completions/max_terminated_length": 80.8, "completions/mean_length": 35.03125, "completions/mean_terminated_length": 35.03125, "completions/min_length": 18.6, "completions/min_terminated_length": 18.6, "epoch": 1.5435897435897434, "frac_reward_zero_std": 0.0, "grad_norm": 1.7253738641738892, "kl": 1.1878035724163056, "learning_rate": 2.3323023519411503e-05, "loss": 0.1154, "num_tokens": 2541265.0, "reward": 27.669879531860353, "reward_std": 43.78490676879883, "rewards/wrapper/mean": 6.917469450831414, "rewards/wrapper/std": 23.019567246735097, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 72.4, "completions/max_terminated_length": 72.4, "completions/mean_length": 31.628125, "completions/mean_terminated_length": 31.628125, "completions/min_length": 19.6, "completions/min_terminated_length": 19.6, "epoch": 1.594871794871795, "frac_reward_zero_std": 0.0, "grad_norm": 4.564350605010986, "kl": 1.3340431094169616, "learning_rate": 2.1668879933418993e-05, "loss": 0.1029, "num_tokens": 2602789.0, "reward": 39.08445358276367, "reward_std": 42.32896957397461, "rewards/wrapper/mean": 9.771112731099128, "rewards/wrapper/std": 20.528056579828263, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 76.8, "completions/max_terminated_length": 76.8, "completions/mean_length": 29.2515625, "completions/mean_terminated_length": 29.2515625, "completions/min_length": 18.6, "completions/min_terminated_length": 18.6, "epoch": 1.646153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 1.9051811695098877, "kl": 1.2881734311580657, "learning_rate": 2.0169120918917907e-05, "loss": 0.0809, "num_tokens": 2663192.0, "reward": 52.69954376220703, "reward_std": 32.72202682495117, "rewards/wrapper/mean": 13.174886631965638, "rewards/wrapper/std": 16.570664164423942, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 70.8, "completions/max_terminated_length": 70.8, "completions/mean_length": 31.2953125, "completions/mean_terminated_length": 31.2953125, "completions/min_length": 20.8, "completions/min_terminated_length": 20.8, "epoch": 1.6974358974358974, "frac_reward_zero_std": 0.0, "grad_norm": 1.7347543239593506, "kl": 1.1357142448425293, "learning_rate": 1.8838372077370462e-05, "loss": 0.0823, "num_tokens": 2724121.0, "reward": 37.44524154663086, "reward_std": 37.703224182128906, "rewards/wrapper/mean": 9.361309441924096, "rewards/wrapper/std": 20.84148458391428, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 54.4, "completions/max_terminated_length": 54.4, "completions/mean_length": 28.6734375, "completions/mean_terminated_length": 28.6734375, "completions/min_length": 18.2, "completions/min_terminated_length": 18.2, "epoch": 1.7487179487179487, "frac_reward_zero_std": 0.0, "grad_norm": 1.6308436393737793, "kl": 1.1717410743236543, "learning_rate": 1.7689610828491808e-05, "loss": 0.0589, "num_tokens": 2784650.0, "reward": 58.644406127929685, "reward_std": 29.47605781555176, "rewards/wrapper/mean": 14.661102002859115, "rewards/wrapper/std": 14.715145578980446, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 65.8, "completions/max_terminated_length": 65.8, "completions/mean_length": 29.45625, "completions/mean_terminated_length": 29.45625, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 1.8, "frac_reward_zero_std": 0.0, "grad_norm": 2.117802858352661, "kl": 1.2103615283966065, "learning_rate": 1.67340398548722e-05, "loss": 0.0726, "num_tokens": 2844024.0, "reward": 59.70240249633789, "reward_std": 27.98085708618164, "rewards/wrapper/mean": 14.925600388646126, "rewards/wrapper/std": 15.45614178404212, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 29.1984375, "completions/mean_terminated_length": 29.1984375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 1.8512820512820514, "frac_reward_zero_std": 0.0, "grad_norm": 1.60492742061615, "kl": 1.2078358113765717, "learning_rate": 1.5980977853778245e-05, "loss": 0.0608, "num_tokens": 2903601.0, "reward": 63.7161376953125, "reward_std": 28.505185317993163, "rewards/wrapper/mean": 15.929034775495529, "rewards/wrapper/std": 14.41244371831417, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 30.325, "completions/mean_terminated_length": 30.325, "completions/min_length": 20.6, "completions/min_terminated_length": 20.6, "epoch": 1.9025641025641025, "frac_reward_zero_std": 0.0, "grad_norm": 1.4518439769744873, "kl": 1.1801301002502442, "learning_rate": 1.5437768661518216e-05, "loss": 0.0657, "num_tokens": 2964755.0, "reward": 58.35861740112305, "reward_std": 29.095483779907227, "rewards/wrapper/mean": 14.589654579758644, "rewards/wrapper/std": 15.775352307409047, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 52.8, "completions/max_terminated_length": 52.8, "completions/mean_length": 29.8828125, "completions/mean_terminated_length": 29.8828125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 1.953846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 1.6311966180801392, "kl": 1.179757869243622, "learning_rate": 1.5109709636587557e-05, "loss": 0.0579, "num_tokens": 3025992.0, "reward": 66.08211746215821, "reward_std": 29.545513153076172, "rewards/wrapper/mean": 16.520529848337173, "rewards/wrapper/std": 14.267796693742275, "step": 190 }, { "epoch": 1.994871794871795, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": -1.0, "eval_completions/max_length": 39.62, "eval_completions/max_terminated_length": 39.62, "eval_completions/mean_length": 29.4125, "eval_completions/mean_terminated_length": 29.4125, "eval_completions/min_length": 22.36, "eval_completions/min_terminated_length": 22.36, "eval_frac_reward_zero_std": 0.005, "eval_kl": 1.3025571525096893, "eval_loss": 0.06812583655118942, "eval_num_tokens": 3075371.0, "eval_reward": 71.65954360961913, "eval_reward_std": 28.824577026367187, "eval_rewards/wrapper/mean": 17.914886049479247, "eval_rewards/wrapper/std": 13.78032234594226, "eval_runtime": 23.7513, "eval_samples_per_second": 8.421, "eval_steps_per_second": 1.053, "step": 194 } ], "logging_steps": 5, "max_steps": 194, "num_input_tokens_seen": 3075371, "num_train_epochs": 2, "save_steps": 200.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }