{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998452810727179, "eval_steps": 500, "global_step": 242, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1768.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 495.7567138671875, "completions/mean_terminated_length": 495.7567138671875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0041258380608561115, "grad_norm": 0.27149179577827454, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 488918.0, "reward": 0.133928582072258, "reward_std": 0.23120081424713135, "rewards/code_format_reward/mean": 0.0401785708963871, "rewards/code_format_reward/std": 0.1965973675251007, "rewards/curriculum_aware_reward_fn/mean": 0.09375, "rewards/curriculum_aware_reward_fn/std": 0.2918064594268799, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1633.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 475.9910888671875, "completions/mean_terminated_length": 475.9910888671875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.008251676121712223, "grad_norm": 0.25897735357284546, "kl": 0.00028395652770996094, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 951502.0, "reward": 0.180803582072258, "reward_std": 0.23178772628307343, "rewards/code_format_reward/mean": 0.0491071417927742, "rewards/code_format_reward/std": 0.2163332849740982, "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 462.0535888671875, "completions/mean_terminated_length": 462.0535888671875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.012377514182568335, "grad_norm": 0.318085640668869, "kl": 0.00031566619873046875, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 1410901.0, "reward": 0.15625, "reward_std": 0.24990491569042206, "rewards/code_format_reward/mean": 0.0580357126891613, "rewards/code_format_reward/std": 0.23407234251499176, "rewards/curriculum_aware_reward_fn/mean": 0.0982142835855484, "rewards/curriculum_aware_reward_fn/std": 0.29793688654899597, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 489.0469055175781, "completions/mean_terminated_length": 489.0469055175781, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.016503352243424446, "grad_norm": 0.2753855288028717, "kl": 0.00033855438232421875, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 1910015.0, "reward": 0.1674107164144516, "reward_std": 0.2584620714187622, "rewards/code_format_reward/mean": 0.0669642835855484, "rewards/code_format_reward/std": 0.2502395808696747, "rewards/curriculum_aware_reward_fn/mean": 0.1004464253783226, "rewards/curriculum_aware_reward_fn/std": 0.30093035101890564, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 543.6585083007812, "completions/mean_terminated_length": 535.71142578125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.020629190304280558, "grad_norm": 0.3211835026741028, "kl": 0.0004239082336425781, "learning_rate": 1e-06, "loss": 0.0377, "num_tokens": 2432517.0, "reward": 0.2120535969734192, "reward_std": 0.3129008412361145, "rewards/code_format_reward/mean": 0.1049107164144516, "rewards/code_format_reward/std": 0.3067808747291565, "rewards/curriculum_aware_reward_fn/mean": 0.1071428582072258, "rewards/curriculum_aware_reward_fn/std": 0.3096405565738678, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 480.2745666503906, "completions/mean_terminated_length": 472.1856994628906, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.02475502836513667, "grad_norm": 0.42177459597587585, "kl": 0.0009064674377441406, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 2922983.0, "reward": 0.3169642984867096, "reward_std": 0.4401297867298126, "rewards/code_format_reward/mean": 0.1941964328289032, "rewards/code_format_reward/std": 0.3960230052471161, "rewards/curriculum_aware_reward_fn/mean": 0.1227678582072258, "rewards/curriculum_aware_reward_fn/std": 0.3285374045372009, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2573.0, "completions/max_terminated_length": 2573.0, "completions/mean_length": 472.00225830078125, "completions/mean_terminated_length": 472.00225830078125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.02888086642599278, "grad_norm": 0.39649832248687744, "kl": 0.0012350082397460938, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 3415182.0, "reward": 0.3571428656578064, "reward_std": 0.4383024573326111, "rewards/code_format_reward/mean": 0.2299107164144516, "rewards/code_format_reward/std": 0.42124560475349426, "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 470.8906555175781, "completions/mean_terminated_length": 462.78076171875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.03300670448684889, "grad_norm": 0.47364914417266846, "kl": 0.0016469955444335938, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 3889474.0, "reward": 0.3950892984867096, "reward_std": 0.5098843574523926, "rewards/code_format_reward/mean": 0.3035714328289032, "rewards/code_format_reward/std": 0.46031373739242554, "rewards/curriculum_aware_reward_fn/mean": 0.0915178582072258, "rewards/curriculum_aware_reward_fn/std": 0.2886664867401123, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 459.6763610839844, "completions/mean_terminated_length": 459.6763610839844, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.037132542547705004, "grad_norm": 0.43468934297561646, "kl": 0.0029125213623046875, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 4356908.0, "reward": 0.49776792526245117, "reward_std": 0.5324220061302185, "rewards/code_format_reward/mean": 0.4263392984867096, "rewards/code_format_reward/std": 0.49509721994400024, "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774, "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1736.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 417.02679443359375, "completions/mean_terminated_length": 417.02679443359375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.041258380608561115, "grad_norm": 0.4652632772922516, "kl": 0.0046024322509765625, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 4814181.0, "reward": 0.676339328289032, "reward_std": 0.5537610054016113, "rewards/code_format_reward/mean": 0.5758928656578064, "rewards/code_format_reward/std": 0.4947591722011566, "rewards/curriculum_aware_reward_fn/mean": 0.1004464253783226, "rewards/curriculum_aware_reward_fn/std": 0.30093035101890564, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3073.0, "completions/max_terminated_length": 3073.0, "completions/mean_length": 435.3906555175781, "completions/mean_terminated_length": 435.3906555175781, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.04538421866941723, "grad_norm": 0.41315850615501404, "kl": 0.0066375732421875, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 5274284.0, "reward": 0.7745535969734192, "reward_std": 0.535895049571991, "rewards/code_format_reward/mean": 0.6696428656578064, "rewards/code_format_reward/std": 0.4708675146102905, "rewards/curriculum_aware_reward_fn/mean": 0.1049107164144516, "rewards/curriculum_aware_reward_fn/std": 0.3067809045314789, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2184.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 429.8794860839844, "completions/mean_terminated_length": 429.8794860839844, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.04951005673027334, "grad_norm": 0.41841644048690796, "kl": 0.007366180419921875, "learning_rate": 1e-06, "loss": -0.0417, "num_tokens": 5734809.0, "reward": 0.832589328289032, "reward_std": 0.4925895929336548, "rewards/code_format_reward/mean": 0.7209821343421936, "rewards/code_format_reward/std": 0.449017733335495, "rewards/curriculum_aware_reward_fn/mean": 0.1116071417927742, "rewards/curriculum_aware_reward_fn/std": 0.31523454189300537, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1761.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 432.1227722167969, "completions/mean_terminated_length": 432.1227722167969, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.05363589479112945, "grad_norm": 0.3938623368740082, "kl": 0.00563812255859375, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 6196899.0, "reward": 0.879464328289032, "reward_std": 0.47405940294265747, "rewards/code_format_reward/mean": 0.765625, "rewards/code_format_reward/std": 0.42408111691474915, "rewards/curriculum_aware_reward_fn/mean": 0.1138392835855484, "rewards/curriculum_aware_reward_fn/std": 0.31797105073928833, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3185.0, "completions/max_terminated_length": 3185.0, "completions/mean_length": 417.5937805175781, "completions/mean_terminated_length": 417.5937805175781, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.05776173285198556, "grad_norm": 0.35270336270332336, "kl": 0.012054443359375, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 6669084.0, "reward": 0.9642857313156128, "reward_std": 0.4013085663318634, "rewards/code_format_reward/mean": 0.8191964030265808, "rewards/code_format_reward/std": 0.38528555631637573, "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, "rewards/curriculum_aware_reward_fn/std": 0.352584570646286, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 411.1495666503906, "completions/mean_terminated_length": 411.1495666503906, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.06188757091284167, "grad_norm": 0.33457455039024353, "kl": 0.01195526123046875, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 7107855.0, "reward": 1.055803656578064, "reward_std": 0.3924632966518402, "rewards/code_format_reward/mean": 0.8705357313156128, "rewards/code_format_reward/std": 0.3360883891582489, "rewards/curriculum_aware_reward_fn/mean": 0.1852678507566452, "rewards/curriculum_aware_reward_fn/std": 0.38894903659820557, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1988.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 428.9598388671875, "completions/mean_terminated_length": 428.9598388671875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.06601340897369778, "grad_norm": 0.3297821581363678, "kl": 0.00592803955078125, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 7584621.0, "reward": 1.0066965818405151, "reward_std": 0.3329208493232727, "rewards/code_format_reward/mean": 0.890625, "rewards/code_format_reward/std": 0.3124580383300781, "rewards/curriculum_aware_reward_fn/mean": 0.1160714253783226, "rewards/curriculum_aware_reward_fn/std": 0.32066863775253296, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 442.9442138671875, "completions/mean_terminated_length": 442.9442138671875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.07013924703455389, "grad_norm": 0.2896265387535095, "kl": 0.008983612060546875, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 8042123.0, "reward": 0.9843750596046448, "reward_std": 0.264969140291214, "rewards/code_format_reward/mean": 0.9263392686843872, "rewards/code_format_reward/std": 0.2615099549293518, "rewards/curriculum_aware_reward_fn/mean": 0.0580357126891613, "rewards/curriculum_aware_reward_fn/std": 0.23407234251499176, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 415.1094055175781, "completions/mean_terminated_length": 415.1094055175781, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.07426508509541001, "grad_norm": 0.28677481412887573, "kl": 0.006587982177734375, "learning_rate": 1e-06, "loss": 0.0268, "num_tokens": 8497473.0, "reward": 1.0937501192092896, "reward_std": 0.2826189398765564, "rewards/code_format_reward/mean": 0.9553571343421936, "rewards/code_format_reward/std": 0.2067493200302124, "rewards/curriculum_aware_reward_fn/mean": 0.1383928507566452, "rewards/curriculum_aware_reward_fn/std": 0.34569787979125977, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 428.6383972167969, "completions/mean_terminated_length": 428.6383972167969, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.07839092315626611, "grad_norm": 0.2441217303276062, "kl": 0.011020660400390625, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 8979982.0, "reward": 1.0669643878936768, "reward_std": 0.18279224634170532, "rewards/code_format_reward/mean": 0.96875, "rewards/code_format_reward/std": 0.17418713867664337, "rewards/curriculum_aware_reward_fn/mean": 0.0982142835855484, "rewards/curriculum_aware_reward_fn/std": 0.29793688654899597, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1687.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 444.2076110839844, "completions/mean_terminated_length": 444.2076110839844, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.08251676121712223, "grad_norm": 0.49515166878700256, "kl": 0.0452423095703125, "learning_rate": 1e-06, "loss": 0.0329, "num_tokens": 9448172.0, "reward": 1.111607313156128, "reward_std": 0.2632156014442444, "rewards/code_format_reward/mean": 0.9620535969734192, "rewards/code_format_reward/std": 0.191280335187912, "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968, "rewards/curriculum_aware_reward_fn/std": 0.35703200101852417, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1970.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 421.29241943359375, "completions/mean_terminated_length": 421.29241943359375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.08664259927797834, "grad_norm": 0.2293916791677475, "kl": 0.007633209228515625, "learning_rate": 1e-06, "loss": 0.0281, "num_tokens": 9897892.0, "reward": 1.0803571939468384, "reward_std": 0.19700251519680023, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774, "rewards/curriculum_aware_reward_fn/std": 0.30387789011001587, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 415.4933166503906, "completions/mean_terminated_length": 415.4933166503906, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.09076843733883445, "grad_norm": 0.21559171378612518, "kl": 0.0082244873046875, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 10341164.0, "reward": 1.0892857313156128, "reward_std": 0.16836272180080414, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1004464253783226, "rewards/curriculum_aware_reward_fn/std": 0.30093035101890564, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 408.7701110839844, "completions/mean_terminated_length": 408.7701110839844, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.09489427539969056, "grad_norm": 0.21035130321979523, "kl": 0.009906768798828125, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 10786842.0, "reward": 1.0758929252624512, "reward_std": 0.1566229909658432, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 429.4821472167969, "completions/mean_terminated_length": 429.4821472167969, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.09902011346054668, "grad_norm": 0.2009419947862625, "kl": 0.008941650390625, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 11251515.0, "reward": 1.0825893878936768, "reward_std": 0.1489134579896927, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.09375, "rewards/curriculum_aware_reward_fn/std": 0.2918064594268799, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 405.55804443359375, "completions/mean_terminated_length": 397.302001953125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.10314595152140278, "grad_norm": 0.25105175375938416, "kl": 0.01844024658203125, "learning_rate": 1e-06, "loss": 0.0132, "num_tokens": 11703622.0, "reward": 1.149553656578064, "reward_std": 0.19908782839775085, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2064.0, "completions/max_terminated_length": 2064.0, "completions/mean_length": 431.9933166503906, "completions/mean_terminated_length": 431.9933166503906, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.1072717895822589, "grad_norm": 0.22682242095470428, "kl": 0.0138702392578125, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 12174474.0, "reward": 1.1205357313156128, "reward_std": 0.2094394564628601, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2225.0, "completions/mean_length": 409.0089416503906, "completions/mean_terminated_length": 400.7606201171875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.111397627643115, "grad_norm": 0.2308988720178604, "kl": 0.01055908203125, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 12633230.0, "reward": 1.1584821939468384, "reward_std": 0.21663519740104675, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 404.4933166503906, "completions/mean_terminated_length": 404.4933166503906, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.11552346570397112, "grad_norm": 0.2662091851234436, "kl": 0.01111602783203125, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 13085651.0, "reward": 1.1071429252624512, "reward_std": 0.19443172216415405, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 398.39288330078125, "completions/mean_terminated_length": 390.1208190917969, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.11964930376482723, "grad_norm": 0.2550451457500458, "kl": 0.01302337646484375, "learning_rate": 1e-06, "loss": 0.0396, "num_tokens": 13541121.0, "reward": 1.1383929252624512, "reward_std": 0.22243839502334595, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2087.0, "completions/max_terminated_length": 2087.0, "completions/mean_length": 398.12054443359375, "completions/mean_terminated_length": 398.12054443359375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.12377514182568335, "grad_norm": 0.26581788063049316, "kl": 0.01679229736328125, "learning_rate": 1e-06, "loss": 0.0216, "num_tokens": 13982847.0, "reward": 1.1450893878936768, "reward_std": 0.24129368364810944, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, "rewards/curriculum_aware_reward_fn/std": 0.3592141568660736, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 395.62725830078125, "completions/mean_terminated_length": 395.62725830078125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.12790097988653945, "grad_norm": 0.30181464552879333, "kl": 0.01206207275390625, "learning_rate": 1e-06, "loss": 0.0192, "num_tokens": 14442020.0, "reward": 1.1450893878936768, "reward_std": 0.2320331335067749, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484, "rewards/curriculum_aware_reward_fn/std": 0.3676777780056, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1305.0, "completions/max_terminated_length": 1305.0, "completions/mean_length": 385.6964416503906, "completions/mean_terminated_length": 385.6964416503906, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.13202681794739557, "grad_norm": 0.2625160217285156, "kl": 0.01248931884765625, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 14885390.0, "reward": 1.1785714626312256, "reward_std": 0.2383158802986145, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1875, "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 376.3035888671875, "completions/mean_terminated_length": 376.3035888671875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.1361526560082517, "grad_norm": 0.27431049942970276, "kl": 0.0135955810546875, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 15331615.0, "reward": 1.1383929252624512, "reward_std": 0.2413758635520935, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1417.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 392.32366943359375, "completions/mean_terminated_length": 392.32366943359375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.14027849406910778, "grad_norm": 0.25579994916915894, "kl": 0.02101898193359375, "learning_rate": 1e-06, "loss": 0.0296, "num_tokens": 15768546.0, "reward": 1.1473214626312256, "reward_std": 0.23827631771564484, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, "rewards/curriculum_aware_reward_fn/std": 0.3714611530303955, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1428.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 376.0870666503906, "completions/mean_terminated_length": 376.0870666503906, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.1444043321299639, "grad_norm": 0.2485678791999817, "kl": 0.0143585205078125, "learning_rate": 1e-06, "loss": 0.0159, "num_tokens": 16221429.0, "reward": 1.1919643878936768, "reward_std": 0.2173694670200348, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 382.54913330078125, "completions/mean_terminated_length": 374.2416076660156, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.14853017019082002, "grad_norm": 0.20657505095005035, "kl": 0.0134429931640625, "learning_rate": 1e-06, "loss": 0.0331, "num_tokens": 16662087.0, "reward": 1.087053656578064, "reward_std": 0.12363066524267197, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0959821417927742, "rewards/curriculum_aware_reward_fn/std": 0.29489606618881226, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2187.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 391.2901916503906, "completions/mean_terminated_length": 391.2901916503906, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.15265600825167613, "grad_norm": 0.25777384638786316, "kl": 0.01773834228515625, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 17105457.0, "reward": 1.15625, "reward_std": 0.22190196812152863, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1098.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 368.5535888671875, "completions/mean_terminated_length": 368.5535888671875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.15678184631253222, "grad_norm": 0.2411869913339615, "kl": 0.0119781494140625, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 17528057.0, "reward": 1.1406251192092896, "reward_std": 0.18761612474918365, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548, "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 381.8951110839844, "completions/mean_terminated_length": 381.8951110839844, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.16090768437338834, "grad_norm": 0.25597110390663147, "kl": 0.01470947265625, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 17969787.0, "reward": 1.1227679252624512, "reward_std": 0.21488060057163239, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1164.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 369.27679443359375, "completions/mean_terminated_length": 369.27679443359375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.16503352243424446, "grad_norm": 0.2239292860031128, "kl": 0.0122833251953125, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 18398864.0, "reward": 1.09375, "reward_std": 0.15493617951869965, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.0959821417927742, "rewards/curriculum_aware_reward_fn/std": 0.29489606618881226, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 367.14288330078125, "completions/mean_terminated_length": 367.14288330078125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.16915936049510058, "grad_norm": 0.2241775542497635, "kl": 0.012298583984375, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 18833964.0, "reward": 1.1272320747375488, "reward_std": 0.16844365000724792, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1339285671710968, "rewards/curriculum_aware_reward_fn/std": 0.34095630049705505, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 324.0379638671875, "completions/mean_terminated_length": 324.0379638671875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.17328519855595667, "grad_norm": 0.30093643069267273, "kl": 0.01483917236328125, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 19220507.0, "reward": 1.2299107313156128, "reward_std": 0.25780999660491943, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.2366071492433548, "rewards/curriculum_aware_reward_fn/std": 0.4254741966724396, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2147.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 357.3214416503906, "completions/mean_terminated_length": 357.3214416503906, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.1774110366168128, "grad_norm": 0.29579633474349976, "kl": 0.01251983642578125, "learning_rate": 1e-06, "loss": 0.0362, "num_tokens": 19641267.0, "reward": 1.196428656578064, "reward_std": 0.2200946807861328, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2008928507566452, "rewards/curriculum_aware_reward_fn/std": 0.4011160135269165, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 368.7388610839844, "completions/mean_terminated_length": 368.7388610839844, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.1815368746776689, "grad_norm": 1.4610792398452759, "kl": 0.19786834716796875, "learning_rate": 1e-06, "loss": 0.0355, "num_tokens": 20051959.0, "reward": 1.1875, "reward_std": 0.24660581350326538, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.3960230350494385, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 373.07366943359375, "completions/mean_terminated_length": 373.07366943359375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.18566271273852503, "grad_norm": 0.2601926624774933, "kl": 0.01108551025390625, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 20508518.0, "reward": 1.180803656578064, "reward_std": 0.22298960387706757, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1852678507566452, "rewards/curriculum_aware_reward_fn/std": 0.3889490067958832, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1396.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 354.5714416503906, "completions/mean_terminated_length": 354.5714416503906, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.18978855079938112, "grad_norm": 0.4056430757045746, "kl": 0.05358123779296875, "learning_rate": 1e-06, "loss": 0.0271, "num_tokens": 20935515.0, "reward": 1.1250001192092896, "reward_std": 0.20658718049526215, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 365.9844055175781, "completions/mean_terminated_length": 365.9844055175781, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.19391438886023724, "grad_norm": 0.2424011379480362, "kl": 0.0110931396484375, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 21369606.0, "reward": 1.171875, "reward_std": 0.1819511502981186, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.171875, "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1226.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 342.2344055175781, "completions/mean_terminated_length": 342.2344055175781, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.19804022692109335, "grad_norm": 0.2600545287132263, "kl": 0.01598358154296875, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 21777608.0, "reward": 1.1741071939468384, "reward_std": 0.2212710827589035, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, "rewards/curriculum_aware_reward_fn/std": 0.387128084897995, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 339.59375, "completions/mean_terminated_length": 339.59375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.20216606498194944, "grad_norm": 0.21470698714256287, "kl": 0.01085662841796875, "learning_rate": 1e-06, "loss": 0.0251, "num_tokens": 22199680.0, "reward": 1.1450893878936768, "reward_std": 0.1625155508518219, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, "rewards/curriculum_aware_reward_fn/std": 0.3588734269142151, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 342.5469055175781, "completions/mean_terminated_length": 342.5469055175781, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.20629190304280556, "grad_norm": 0.2171521633863449, "kl": 0.015045166015625, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 22613597.0, "reward": 1.1629464626312256, "reward_std": 0.1549137532711029, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, "rewards/curriculum_aware_reward_fn/std": 0.37175679206848145, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 319.1942138671875, "completions/mean_terminated_length": 319.1942138671875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.21041774110366168, "grad_norm": 0.24528421461582184, "kl": 0.012359619140625, "learning_rate": 1e-06, "loss": 0.0218, "num_tokens": 23020496.0, "reward": 1.1629464626312256, "reward_std": 0.16526976227760315, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.171875, "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 372.7901916503906, "completions/mean_terminated_length": 372.7901916503906, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.2145435791645178, "grad_norm": 0.26358896493911743, "kl": 0.02085113525390625, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 23478559.0, "reward": 1.1138393878936768, "reward_std": 0.2242853045463562, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.1383928507566452, "rewards/curriculum_aware_reward_fn/std": 0.34569787979125977, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 332.10491943359375, "completions/mean_terminated_length": 332.10491943359375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.2186694172253739, "grad_norm": 0.2957030236721039, "kl": 0.0118408203125, "learning_rate": 1e-06, "loss": 0.0364, "num_tokens": 23891158.0, "reward": 1.196428656578064, "reward_std": 0.2678048014640808, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.203125, "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 347.8906555175781, "completions/mean_terminated_length": 347.8906555175781, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.22279525528623, "grad_norm": 0.24990341067314148, "kl": 0.01149749755859375, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 24300525.0, "reward": 1.109375, "reward_std": 0.18786342442035675, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1209.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 340.3683166503906, "completions/mean_terminated_length": 340.3683166503906, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.22692109334708613, "grad_norm": 0.2709488570690155, "kl": 0.0154266357421875, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 24747332.0, "reward": 1.1584821939468384, "reward_std": 0.22672808170318604, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452, "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 342.34600830078125, "completions/mean_terminated_length": 333.94854736328125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.23104693140794225, "grad_norm": 0.2587932348251343, "kl": 0.01580047607421875, "learning_rate": 1e-06, "loss": 0.0377, "num_tokens": 25190447.0, "reward": 1.1116071939468384, "reward_std": 0.19490285217761993, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1909.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 341.9687805175781, "completions/mean_terminated_length": 341.9687805175781, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.23517276946879834, "grad_norm": 0.23299667239189148, "kl": 0.0140533447265625, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 25621395.0, "reward": 1.1294643878936768, "reward_std": 0.17807196080684662, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.140625, "rewards/curriculum_aware_reward_fn/std": 0.3480229377746582, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 305.4598388671875, "completions/mean_terminated_length": 305.4598388671875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.23929860752965446, "grad_norm": 0.3313480317592621, "kl": 0.0171356201171875, "learning_rate": 1e-06, "loss": 0.0425, "num_tokens": 26019166.0, "reward": 1.243303656578064, "reward_std": 0.3034422993659973, "rewards/code_format_reward/mean": 0.9732142686843872, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.2700892984867096, "rewards/curriculum_aware_reward_fn/std": 0.4445020854473114, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1268.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 324.34375, "completions/mean_terminated_length": 324.34375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.24342444559051057, "grad_norm": 0.2744191586971283, "kl": 0.015655517578125, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 26434322.0, "reward": 1.1584821939468384, "reward_std": 0.2397712618112564, "rewards/code_format_reward/mean": 0.9732142686843872, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.1852678507566452, "rewards/curriculum_aware_reward_fn/std": 0.38894903659820557, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1889.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 318.5692138671875, "completions/mean_terminated_length": 318.5692138671875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2475502836513667, "grad_norm": 0.2724306583404541, "kl": 0.0138397216796875, "learning_rate": 1e-06, "loss": 0.0263, "num_tokens": 26858532.0, "reward": 1.171875, "reward_std": 0.22086723148822784, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, "rewards/curriculum_aware_reward_fn/std": 0.3834211826324463, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1191.0, "completions/max_terminated_length": 1191.0, "completions/mean_length": 314.8526916503906, "completions/mean_terminated_length": 314.8526916503906, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2516761217122228, "grad_norm": 0.3130624294281006, "kl": 0.01505279541015625, "learning_rate": 1e-06, "loss": 0.024, "num_tokens": 27259811.0, "reward": 1.1584821939468384, "reward_std": 0.2217356413602829, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 317.8348388671875, "completions/mean_terminated_length": 317.8348388671875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.2558019597730789, "grad_norm": 0.34156128764152527, "kl": 0.02947235107421875, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 27685988.0, "reward": 1.1696429252624512, "reward_std": 0.2626109719276428, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.171875, "rewards/curriculum_aware_reward_fn/std": 0.38357093930244446, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 306.6473388671875, "completions/mean_terminated_length": 306.6473388671875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.259927797833935, "grad_norm": 0.28197574615478516, "kl": 0.01535797119140625, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 28085880.0, "reward": 1.2366071939468384, "reward_std": 0.2570823132991791, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2477678507566452, "rewards/curriculum_aware_reward_fn/std": 0.4321989119052887, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 349.3883972167969, "completions/mean_terminated_length": 349.3883972167969, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.26405363589479114, "grad_norm": 0.2599468529224396, "kl": 0.02356719970703125, "learning_rate": 1e-06, "loss": 0.0352, "num_tokens": 28522287.0, "reward": 1.1116071939468384, "reward_std": 0.17702263593673706, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1227678582072258, "rewards/curriculum_aware_reward_fn/std": 0.3285374045372009, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 326.78125, "completions/mean_terminated_length": 318.3489990234375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.26817947395564723, "grad_norm": 0.2813323438167572, "kl": 0.01654052734375, "learning_rate": 1e-06, "loss": 0.0386, "num_tokens": 28938066.0, "reward": 1.140625, "reward_std": 0.25763756036758423, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 293.7901916503906, "completions/mean_terminated_length": 293.7901916503906, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.2723053120165034, "grad_norm": 0.30718564987182617, "kl": 0.020751953125, "learning_rate": 1e-06, "loss": 0.0276, "num_tokens": 29339723.0, "reward": 1.1674107313156128, "reward_std": 0.2789299190044403, "rewards/code_format_reward/mean": 0.9732142686843872, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.3960230052471161, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1135.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 288.8571472167969, "completions/mean_terminated_length": 288.8571472167969, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.27643115007735947, "grad_norm": 0.3281542956829071, "kl": 0.045013427734375, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 29726220.0, "reward": 1.1450893878936768, "reward_std": 0.1986456662416458, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2215.0, "completions/max_terminated_length": 2215.0, "completions/mean_length": 297.63616943359375, "completions/mean_terminated_length": 297.63616943359375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.28055698813821556, "grad_norm": 0.2958904802799225, "kl": 0.01969146728515625, "learning_rate": 1e-06, "loss": 0.0435, "num_tokens": 30123839.0, "reward": 1.234375, "reward_std": 0.29684755206108093, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.2477678507566452, "rewards/curriculum_aware_reward_fn/std": 0.4321989119052887, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1223.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 293.0379638671875, "completions/mean_terminated_length": 293.0379638671875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2846828261990717, "grad_norm": 0.28549474477767944, "kl": 0.022735595703125, "learning_rate": 1e-06, "loss": 0.0132, "num_tokens": 30513112.0, "reward": 1.140625, "reward_std": 0.21831631660461426, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, "rewards/curriculum_aware_reward_fn/std": 0.36136940121650696, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 295.7477722167969, "completions/mean_terminated_length": 295.7477722167969, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2888086642599278, "grad_norm": 0.2900505065917969, "kl": 0.0165252685546875, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 30901226.0, "reward": 1.140625, "reward_std": 0.20353375375270844, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032, "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1147.0, "completions/max_terminated_length": 1147.0, "completions/mean_length": 288.2165222167969, "completions/mean_terminated_length": 288.2165222167969, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.2929345023207839, "grad_norm": 0.31868776679039, "kl": 0.017059326171875, "learning_rate": 1e-06, "loss": 0.017, "num_tokens": 31304019.0, "reward": 1.2232143878936768, "reward_std": 0.2826867401599884, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.2321428507566452, "rewards/curriculum_aware_reward_fn/std": 0.4226716160774231, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 296.65850830078125, "completions/mean_terminated_length": 296.65850830078125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.29706034038164003, "grad_norm": 0.29709187150001526, "kl": 0.01712799072265625, "learning_rate": 1e-06, "loss": 0.0238, "num_tokens": 31717377.0, "reward": 1.1473214626312256, "reward_std": 0.23680853843688965, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 288.1004638671875, "completions/mean_terminated_length": 288.1004638671875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.3011861784424961, "grad_norm": 0.2860942482948303, "kl": 0.0174102783203125, "learning_rate": 1e-06, "loss": 0.0182, "num_tokens": 32111697.0, "reward": 1.118303656578064, "reward_std": 0.21186980605125427, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.140625, "rewards/curriculum_aware_reward_fn/std": 0.36065009236335754, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 316.82366943359375, "completions/mean_terminated_length": 316.82366943359375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.30531201650335227, "grad_norm": 0.2843763530254364, "kl": 0.02020263671875, "learning_rate": 1e-06, "loss": 0.0501, "num_tokens": 32546437.0, "reward": 1.1205357313156128, "reward_std": 0.21844343841075897, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.140625, "rewards/curriculum_aware_reward_fn/std": 0.3480229377746582, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1899.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 290.46875, "completions/mean_terminated_length": 290.46875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.30943785456420836, "grad_norm": 0.2609272003173828, "kl": 0.01959991455078125, "learning_rate": 1e-06, "loss": 0.0218, "num_tokens": 32942910.0, "reward": 1.1875, "reward_std": 0.21647420525550842, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.2053571492433548, "rewards/curriculum_aware_reward_fn/std": 0.4099084138870239, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 309.2745666503906, "completions/mean_terminated_length": 300.8031311035156, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.31356369262506445, "grad_norm": 0.30251428484916687, "kl": 0.02301025390625, "learning_rate": 1e-06, "loss": 0.022, "num_tokens": 33366442.0, "reward": 1.1205358505249023, "reward_std": 0.2311568409204483, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, "rewards/curriculum_aware_reward_fn/std": 0.352584570646286, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1616.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 308.82366943359375, "completions/mean_terminated_length": 308.82366943359375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3176895306859206, "grad_norm": 0.28500479459762573, "kl": 0.02339935302734375, "learning_rate": 1e-06, "loss": 0.0252, "num_tokens": 33784109.0, "reward": 1.1607143878936768, "reward_std": 0.2052522748708725, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1741071492433548, "rewards/curriculum_aware_reward_fn/std": 0.4080280661582947, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 298.5, "completions/mean_terminated_length": 298.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.3218153687467767, "grad_norm": 0.2972322702407837, "kl": 0.01988983154296875, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 34193385.0, "reward": 1.1428571939468384, "reward_std": 0.23864628374576569, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484, "rewards/curriculum_aware_reward_fn/std": 0.3676777780056, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 296.9151916503906, "completions/mean_terminated_length": 296.9151916503906, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.3259412068076328, "grad_norm": 0.30418986082077026, "kl": 0.0212860107421875, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 34597688.0, "reward": 1.1785714626312256, "reward_std": 0.23233453929424286, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484, "rewards/curriculum_aware_reward_fn/std": 0.3942854404449463, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 293.2745666503906, "completions/mean_terminated_length": 293.2745666503906, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.3300670448684889, "grad_norm": 0.26860466599464417, "kl": 0.0164947509765625, "learning_rate": 1e-06, "loss": 0.0261, "num_tokens": 34991365.0, "reward": 1.1696429252624512, "reward_std": 0.2292947769165039, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1741071492433548, "rewards/curriculum_aware_reward_fn/std": 0.37962549924850464, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 294.60491943359375, "completions/mean_terminated_length": 294.60491943359375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.334192882929345, "grad_norm": 0.29946035146713257, "kl": 0.024200439453125, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 35394752.0, "reward": 1.1852679252624512, "reward_std": 0.21951647102832794, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.2053571492433548, "rewards/curriculum_aware_reward_fn/std": 0.4099084138870239, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1190.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 297.3013610839844, "completions/mean_terminated_length": 297.3013610839844, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.33831872099020116, "grad_norm": 0.27935469150543213, "kl": 0.02008056640625, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 35796267.0, "reward": 1.15625, "reward_std": 0.19204403460025787, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 311.8035888671875, "completions/mean_terminated_length": 311.8035888671875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.34244455905105725, "grad_norm": 0.29073479771614075, "kl": 0.0170745849609375, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 36227939.0, "reward": 1.1919643878936768, "reward_std": 0.2728196084499359, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.2075892835855484, "rewards/curriculum_aware_reward_fn/std": 0.4060344398021698, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 324.9129638671875, "completions/mean_terminated_length": 324.9129638671875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.34657039711191334, "grad_norm": 0.2512301206588745, "kl": 0.01975250244140625, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 36656433.0, "reward": 1.1674107313156128, "reward_std": 0.2097610980272293, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.171875, "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 314.734375, "completions/mean_terminated_length": 314.734375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.3506962351727695, "grad_norm": 0.2750551998615265, "kl": 0.01879119873046875, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 37078162.0, "reward": 1.15625, "reward_std": 0.22576312720775604, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2680.0, "completions/max_terminated_length": 2680.0, "completions/mean_length": 296.37054443359375, "completions/mean_terminated_length": 296.37054443359375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.3548220732336256, "grad_norm": 0.25370389223098755, "kl": 0.022674560546875, "learning_rate": 1e-06, "loss": 0.0378, "num_tokens": 37468480.0, "reward": 1.118303656578064, "reward_std": 0.17590686678886414, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1339285671710968, "rewards/curriculum_aware_reward_fn/std": 0.34095630049705505, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 324.3169860839844, "completions/mean_terminated_length": 324.3169860839844, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.35894791129448167, "grad_norm": 0.24342288076877594, "kl": 0.01543426513671875, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 37898128.0, "reward": 1.165178656578064, "reward_std": 0.20304544270038605, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.171875, "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 315.9620666503906, "completions/mean_terminated_length": 315.9620666503906, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.3630737493553378, "grad_norm": 0.2748737633228302, "kl": 0.01773834228515625, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 38315395.0, "reward": 1.1852679252624512, "reward_std": 0.21113555133342743, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.3960230350494385, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 327.4375, "completions/mean_terminated_length": 327.4375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.3671995874161939, "grad_norm": 0.26676246523857117, "kl": 0.0194549560546875, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 38743963.0, "reward": 1.1629464626312256, "reward_std": 0.2198539525270462, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452, "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 315.9888610839844, "completions/mean_terminated_length": 315.9888610839844, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.37132542547705005, "grad_norm": 0.3028186857700348, "kl": 0.0179290771484375, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 39165057.0, "reward": 1.2232143878936768, "reward_std": 0.2633320689201355, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 323.0, "completions/mean_terminated_length": 323.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.37545126353790614, "grad_norm": 0.274135559797287, "kl": 0.0183258056640625, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 39587941.0, "reward": 1.156250238418579, "reward_std": 0.2299468219280243, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 317.8526916503906, "completions/mean_terminated_length": 317.8526916503906, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.37957710159876223, "grad_norm": 0.26673728227615356, "kl": 0.0158538818359375, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 39989003.0, "reward": 1.1607143878936768, "reward_std": 0.22157248854637146, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, "rewards/curriculum_aware_reward_fn/std": 0.3836035132408142, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1242.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 316.2723388671875, "completions/mean_terminated_length": 316.2723388671875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.3837029396596184, "grad_norm": 0.31823644042015076, "kl": 0.017242431640625, "learning_rate": 1e-06, "loss": 0.034, "num_tokens": 40404162.0, "reward": 1.265625, "reward_std": 0.3281380832195282, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.2745535671710968, "rewards/curriculum_aware_reward_fn/std": 0.46638673543930054, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 329.4754638671875, "completions/mean_terminated_length": 329.4754638671875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.38782877772047447, "grad_norm": 0.273196816444397, "kl": 0.019561767578125, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 40817277.0, "reward": 1.171875, "reward_std": 0.23742373287677765, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, "rewards/curriculum_aware_reward_fn/std": 0.3834211826324463, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3163.0, "completions/max_terminated_length": 3163.0, "completions/mean_length": 331.68304443359375, "completions/mean_terminated_length": 331.68304443359375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.39195461578133056, "grad_norm": 0.27656933665275574, "kl": 0.0191192626953125, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 41241601.0, "reward": 1.1897321939468384, "reward_std": 0.23682093620300293, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484, "rewards/curriculum_aware_reward_fn/std": 0.3942854404449463, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 335.328125, "completions/mean_terminated_length": 335.328125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.3960804538421867, "grad_norm": 0.24343284964561462, "kl": 0.018890380859375, "learning_rate": 1e-06, "loss": 0.0165, "num_tokens": 41663057.0, "reward": 1.2165179252624512, "reward_std": 0.19940510392189026, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452, "rewards/curriculum_aware_reward_fn/std": 0.41233164072036743, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1417.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 344.95538330078125, "completions/mean_terminated_length": 344.95538330078125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.4002062919030428, "grad_norm": 0.2739112079143524, "kl": 0.0171356201171875, "learning_rate": 1e-06, "loss": 0.0269, "num_tokens": 42083576.0, "reward": 1.2165179252624512, "reward_std": 0.27025100588798523, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2209821492433548, "rewards/curriculum_aware_reward_fn/std": 0.43638429045677185, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 338.8526916503906, "completions/mean_terminated_length": 338.8526916503906, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.4043321299638989, "grad_norm": 0.2864993214607239, "kl": 0.01848602294921875, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 42511056.0, "reward": 1.2053571939468384, "reward_std": 0.26153260469436646, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2098214328289032, "rewards/curriculum_aware_reward_fn/std": 0.40763622522354126, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 348.0870666503906, "completions/mean_terminated_length": 331.2802734375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.40845796802475504, "grad_norm": 0.2637680470943451, "kl": 0.0213165283203125, "learning_rate": 1e-06, "loss": 0.0467, "num_tokens": 42937213.0, "reward": 1.265625, "reward_std": 0.24608273804187775, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.2745535671710968, "rewards/curriculum_aware_reward_fn/std": 0.4467879831790924, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 342.16741943359375, "completions/mean_terminated_length": 333.7695617675781, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.4125838060856111, "grad_norm": 0.25878703594207764, "kl": 0.0192718505859375, "learning_rate": 1e-06, "loss": 0.0328, "num_tokens": 43355139.0, "reward": 1.2008929252624512, "reward_std": 0.2060384899377823, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2053571492433548, "rewards/curriculum_aware_reward_fn/std": 0.40441396832466125, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 363.6294860839844, "completions/mean_terminated_length": 363.6294860839844, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4167096441464673, "grad_norm": 0.4346523880958557, "kl": 0.04067230224609375, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 43801055.0, "reward": 1.1875, "reward_std": 0.23577158153057098, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484, "rewards/curriculum_aware_reward_fn/std": 0.3942854404449463, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 339.3571472167969, "completions/mean_terminated_length": 339.3571472167969, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.42083548220732336, "grad_norm": 0.24393494427204132, "kl": 0.0178985595703125, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 44238435.0, "reward": 1.1584821939468384, "reward_std": 0.1854330599308014, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484, "rewards/curriculum_aware_reward_fn/std": 0.3676777780056, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 362.6227722167969, "completions/mean_terminated_length": 362.6227722167969, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.42496132026817945, "grad_norm": 0.27865517139434814, "kl": 0.0175628662109375, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 44681324.0, "reward": 1.1830357313156128, "reward_std": 0.2439030110836029, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.3960230350494385, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 355.48663330078125, "completions/mean_terminated_length": 355.48663330078125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.4290871583290356, "grad_norm": 0.28729158639907837, "kl": 0.0159454345703125, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 45096565.0, "reward": 1.180803656578064, "reward_std": 0.2765403389930725, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 317.984375, "completions/mean_terminated_length": 317.984375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4332129963898917, "grad_norm": 0.2705976963043213, "kl": 0.017242431640625, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 45499100.0, "reward": 1.2522321939468384, "reward_std": 0.21364028751850128, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2544642984867096, "rewards/curriculum_aware_reward_fn/std": 0.4360465705394745, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1927.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 342.2232360839844, "completions/mean_terminated_length": 342.2232360839844, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.4373388344507478, "grad_norm": 0.2950671911239624, "kl": 0.02044677734375, "learning_rate": 1e-06, "loss": 0.0298, "num_tokens": 45926452.0, "reward": 1.1607143878936768, "reward_std": 0.2596372663974762, "rewards/code_format_reward/mean": 0.9732142686843872, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.1875, "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 326.4598388671875, "completions/mean_terminated_length": 326.4598388671875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.44146467251160393, "grad_norm": 0.2920423746109009, "kl": 0.01879119873046875, "learning_rate": 1e-06, "loss": 0.0273, "num_tokens": 46325511.0, "reward": 1.212053656578064, "reward_std": 0.2717861235141754, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1131.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 326.546875, "completions/mean_terminated_length": 326.546875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.44559051057246, "grad_norm": 0.2972775995731354, "kl": 0.0187225341796875, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 46739932.0, "reward": 1.220982313156128, "reward_std": 0.2685100734233856, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.2366071492433548, "rewards/curriculum_aware_reward_fn/std": 0.4254741966724396, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1091.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 326.4263610839844, "completions/mean_terminated_length": 326.4263610839844, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.44971634863331617, "grad_norm": 0.2996078431606293, "kl": 0.01970672607421875, "learning_rate": 1e-06, "loss": -0.0221, "num_tokens": 47153550.0, "reward": 1.2299107313156128, "reward_std": 0.2617761194705963, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.2433035671710968, "rewards/curriculum_aware_reward_fn/std": 0.42955654859542847, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1417.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 336.40850830078125, "completions/mean_terminated_length": 336.40850830078125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.45384218669417226, "grad_norm": 0.2998186945915222, "kl": 0.0244903564453125, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 47576281.0, "reward": 1.1785714626312256, "reward_std": 0.26998862624168396, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1986607164144516, "rewards/curriculum_aware_reward_fn/std": 0.3994380831718445, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 323.6607360839844, "completions/mean_terminated_length": 323.6607360839844, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.45796802475502835, "grad_norm": 0.2849138379096985, "kl": 0.0202178955078125, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 47966729.0, "reward": 1.2522321939468384, "reward_std": 0.24761906266212463, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.265625, "rewards/curriculum_aware_reward_fn/std": 0.44215917587280273, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 324.5379638671875, "completions/mean_terminated_length": 324.5379638671875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.4620938628158845, "grad_norm": 0.2679043412208557, "kl": 0.02386474609375, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 48394033.0, "reward": 1.1540179252624512, "reward_std": 0.18728980422019958, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452, "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1158.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 332.8883972167969, "completions/mean_terminated_length": 332.8883972167969, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4662197008767406, "grad_norm": 0.2767100930213928, "kl": 0.01922607421875, "learning_rate": 1e-06, "loss": 0.0305, "num_tokens": 48828354.0, "reward": 1.1830357313156128, "reward_std": 0.22662557661533356, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.3960230052471161, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 317.3482360839844, "completions/mean_terminated_length": 317.3482360839844, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.4703455389375967, "grad_norm": 0.23760177195072174, "kl": 0.0172271728515625, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 49222715.0, "reward": 1.2232143878936768, "reward_std": 0.20727001130580902, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2254464328289032, "rewards/curriculum_aware_reward_fn/std": 0.41834312677383423, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 333.2567138671875, "completions/mean_terminated_length": 333.2567138671875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.4744713769984528, "grad_norm": 0.2735610902309418, "kl": 0.02111053466796875, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 49633471.0, "reward": 1.212053656578064, "reward_std": 0.2395302802324295, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.21875, "rewards/curriculum_aware_reward_fn/std": 0.4297715127468109, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 347.7031555175781, "completions/mean_terminated_length": 347.7031555175781, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.4785972150593089, "grad_norm": 0.23646758496761322, "kl": 0.02182769775390625, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 50054696.0, "reward": 1.203125, "reward_std": 0.19250856339931488, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2075892835855484, "rewards/curriculum_aware_reward_fn/std": 0.4060344398021698, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 347.05804443359375, "completions/mean_terminated_length": 347.05804443359375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.48272305312016506, "grad_norm": 0.2396235316991806, "kl": 0.01979827880859375, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 50489128.0, "reward": 1.171875, "reward_std": 0.18470536172389984, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1741071492433548, "rewards/curriculum_aware_reward_fn/std": 0.37962549924850464, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 333.6026916503906, "completions/mean_terminated_length": 333.6026916503906, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.48684889118102115, "grad_norm": 0.2912699580192566, "kl": 0.01638031005859375, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 50909804.0, "reward": 1.180803656578064, "reward_std": 0.23006919026374817, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, "rewards/curriculum_aware_reward_fn/std": 0.39252743124961853, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1610.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 341.90179443359375, "completions/mean_terminated_length": 341.90179443359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.49097472924187724, "grad_norm": 0.24366678297519684, "kl": 0.017578125, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 51335430.0, "reward": 1.171875, "reward_std": 0.19099467992782593, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1741071492433548, "rewards/curriculum_aware_reward_fn/std": 0.4188501238822937, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 336.9665222167969, "completions/mean_terminated_length": 336.9665222167969, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.4951005673027334, "grad_norm": 0.2316557914018631, "kl": 0.02017974853515625, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 51748982.0, "reward": 1.165178656578064, "reward_std": 0.18219274282455444, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 340.4732360839844, "completions/mean_terminated_length": 340.4732360839844, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.4992264053635895, "grad_norm": 0.2609340250492096, "kl": 0.0164337158203125, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 52168662.0, "reward": 1.1919643878936768, "reward_std": 0.19617266952991486, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.3960230350494385, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1317.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 331.5133972167969, "completions/mean_terminated_length": 331.5133972167969, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.5033522434244456, "grad_norm": 0.26645025610923767, "kl": 0.0178375244140625, "learning_rate": 1e-06, "loss": 0.023, "num_tokens": 52583075.0, "reward": 1.2075893878936768, "reward_std": 0.24217791855335236, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2120535671710968, "rewards/curriculum_aware_reward_fn/std": 0.40921956300735474, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1281.0, "completions/max_terminated_length": 1281.0, "completions/mean_length": 347.04241943359375, "completions/mean_terminated_length": 347.04241943359375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5074780814853017, "grad_norm": 0.26141101121902466, "kl": 0.01815032958984375, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 53020237.0, "reward": 1.1897321939468384, "reward_std": 0.20019538700580597, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 320.2120666503906, "completions/mean_terminated_length": 320.2120666503906, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.5116039195461578, "grad_norm": 0.28270936012268066, "kl": 0.01805877685546875, "learning_rate": 1e-06, "loss": 0.0222, "num_tokens": 53419759.0, "reward": 1.1763393878936768, "reward_std": 0.23801317811012268, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1875, "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2187.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 336.64288330078125, "completions/mean_terminated_length": 336.64288330078125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.5157297576070139, "grad_norm": 0.2931639850139618, "kl": 0.0214996337890625, "learning_rate": 1e-06, "loss": 0.0228, "num_tokens": 53833061.0, "reward": 1.25, "reward_std": 0.2512167692184448, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.2566964328289032, "rewards/curriculum_aware_reward_fn/std": 0.4372987747192383, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 326.7276916503906, "completions/mean_terminated_length": 326.7276916503906, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.51985559566787, "grad_norm": 0.2922307848930359, "kl": 0.018096923828125, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 54238848.0, "reward": 1.2232143878936768, "reward_std": 0.25884348154067993, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.234375, "rewards/curriculum_aware_reward_fn/std": 0.42408111691474915, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 353.67413330078125, "completions/mean_terminated_length": 353.67413330078125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.5239814337287262, "grad_norm": 0.26413822174072266, "kl": 0.01689910888671875, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 54670700.0, "reward": 1.1741071939468384, "reward_std": 0.2503284215927124, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1763392835855484, "rewards/curriculum_aware_reward_fn/std": 0.3873537480831146, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1398.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 335.6607360839844, "completions/mean_terminated_length": 335.6607360839844, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.5281072717895823, "grad_norm": 0.24284829199314117, "kl": 0.01674652099609375, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 55087727.0, "reward": 1.1696429252624512, "reward_std": 0.17461249232292175, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.171875, "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 344.2344055175781, "completions/mean_terminated_length": 344.2344055175781, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5322331098504384, "grad_norm": 0.28230011463165283, "kl": 0.019317626953125, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 55539913.0, "reward": 1.2075893878936768, "reward_std": 0.253488689661026, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.21875, "rewards/curriculum_aware_reward_fn/std": 0.4138607978820801, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 329.9151916503906, "completions/mean_terminated_length": 321.48992919921875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.5363589479112945, "grad_norm": 0.26032939553260803, "kl": 0.01861572265625, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 55954359.0, "reward": 1.1941964626312256, "reward_std": 0.22125084698200226, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1986607164144516, "rewards/curriculum_aware_reward_fn/std": 0.41048669815063477, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2736.0, "completions/max_terminated_length": 2736.0, "completions/mean_length": 331.8638610839844, "completions/mean_terminated_length": 331.8638610839844, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.5404847859721505, "grad_norm": 0.23292717337608337, "kl": 0.01721954345703125, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 56381039.0, "reward": 1.1785714626312256, "reward_std": 0.18270127475261688, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1808035671710968, "rewards/curriculum_aware_reward_fn/std": 0.3852856159210205, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 344.79913330078125, "completions/mean_terminated_length": 344.79913330078125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.5446106240330068, "grad_norm": 0.2904663383960724, "kl": 0.01805877685546875, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 56818036.0, "reward": 1.2075893878936768, "reward_std": 0.2461196929216385, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2142857164144516, "rewards/curriculum_aware_reward_fn/std": 0.4107845723628998, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 341.51116943359375, "completions/mean_terminated_length": 333.1118469238281, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.5487364620938628, "grad_norm": 0.26703622937202454, "kl": 0.01871490478515625, "learning_rate": 1e-06, "loss": 0.0226, "num_tokens": 57249500.0, "reward": 1.140625, "reward_std": 0.2159074991941452, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032, "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 337.7410888671875, "completions/mean_terminated_length": 337.7410888671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5528623001547189, "grad_norm": 0.2243984490633011, "kl": 0.01727294921875, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 57671316.0, "reward": 1.1361607313156128, "reward_std": 0.17991097271442413, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1383928507566452, "rewards/curriculum_aware_reward_fn/std": 0.34569787979125977, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 347.38616943359375, "completions/mean_terminated_length": 347.38616943359375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.556988138215575, "grad_norm": 0.2889977991580963, "kl": 0.01727294921875, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 58110613.0, "reward": 1.25, "reward_std": 0.2606619894504547, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2522321343421936, "rewards/curriculum_aware_reward_fn/std": 0.4347793161869049, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 324.65179443359375, "completions/mean_terminated_length": 324.65179443359375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5611139762764311, "grad_norm": 0.28462642431259155, "kl": 0.01802825927734375, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 58517892.0, "reward": 1.2075893878936768, "reward_std": 0.24189238250255585, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2098214328289032, "rewards/curriculum_aware_reward_fn/std": 0.40763622522354126, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 328.5245666503906, "completions/mean_terminated_length": 328.5245666503906, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.5652398143372873, "grad_norm": 0.2616114318370819, "kl": 0.02053070068359375, "learning_rate": 1e-06, "loss": -0.0115, "num_tokens": 58930581.0, "reward": 1.2321429252624512, "reward_std": 0.20312771201133728, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.234375, "rewards/curriculum_aware_reward_fn/std": 0.42408111691474915, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2079.0, "completions/max_terminated_length": 2079.0, "completions/mean_length": 333.92413330078125, "completions/mean_terminated_length": 333.92413330078125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5693656523981434, "grad_norm": 0.2755897343158722, "kl": 0.017974853515625, "learning_rate": 1e-06, "loss": 0.0202, "num_tokens": 59356112.0, "reward": 1.2142857313156128, "reward_std": 0.26117143034935, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452, "rewards/curriculum_aware_reward_fn/std": 0.4282991886138916, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 325.5290222167969, "completions/mean_terminated_length": 325.5290222167969, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.5734914904589995, "grad_norm": 0.2825402021408081, "kl": 0.01815032958984375, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 59752710.0, "reward": 1.2098215818405151, "reward_std": 0.24761906266212463, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2120535671710968, "rewards/curriculum_aware_reward_fn/std": 0.40921953320503235, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3108.0, "completions/max_terminated_length": 3108.0, "completions/mean_length": 337.11163330078125, "completions/mean_terminated_length": 337.11163330078125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5776173285198556, "grad_norm": 0.2626936137676239, "kl": 0.018463134765625, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 60179088.0, "reward": 1.2209821939468384, "reward_std": 0.21219147741794586, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 338.984375, "completions/mean_terminated_length": 338.984375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5817431665807117, "grad_norm": 0.3992301821708679, "kl": 0.05078887939453125, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 60601671.0, "reward": 1.196428656578064, "reward_std": 0.24828356504440308, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2075892835855484, "rewards/curriculum_aware_reward_fn/std": 0.4060344398021698, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 351.26116943359375, "completions/mean_terminated_length": 351.26116943359375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.5858690046415678, "grad_norm": 0.2878393530845642, "kl": 0.01871490478515625, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 61048794.0, "reward": 1.2522321939468384, "reward_std": 0.28646785020828247, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2544642984867096, "rewards/curriculum_aware_reward_fn/std": 0.46098586916923523, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1218.0, "completions/mean_length": 353.5714416503906, "completions/mean_terminated_length": 345.1990966796875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.589994842702424, "grad_norm": 0.2666623890399933, "kl": 0.01739501953125, "learning_rate": 1e-06, "loss": 0.0347, "num_tokens": 61488657.0, "reward": 1.212053656578064, "reward_std": 0.2365129441022873, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2142857164144516, "rewards/curriculum_aware_reward_fn/std": 0.4107845723628998, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1314.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 335.1540222167969, "completions/mean_terminated_length": 335.1540222167969, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5941206807632801, "grad_norm": 0.2896396219730377, "kl": 0.0229034423828125, "learning_rate": 1e-06, "loss": 0.0356, "num_tokens": 61897869.0, "reward": 1.205357313156128, "reward_std": 0.27187272906303406, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452, "rewards/curriculum_aware_reward_fn/std": 0.41233164072036743, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 346.90179443359375, "completions/mean_terminated_length": 346.90179443359375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.5982465188241362, "grad_norm": 0.299888014793396, "kl": 0.0253143310546875, "learning_rate": 1e-06, "loss": 0.0186, "num_tokens": 62322620.0, "reward": 1.2366071939468384, "reward_std": 0.27944642305374146, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.25, "rewards/curriculum_aware_reward_fn/std": 0.43349677324295044, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3885.0, "completions/max_terminated_length": 3885.0, "completions/mean_length": 329.046875, "completions/mean_terminated_length": 329.046875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.6023723568849922, "grad_norm": 0.30937013030052185, "kl": 0.0206298828125, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 62725715.0, "reward": 1.227678656578064, "reward_std": 0.27645939588546753, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.2433035671710968, "rewards/curriculum_aware_reward_fn/std": 0.4499065577983856, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1304.0, "completions/max_terminated_length": 1304.0, "completions/mean_length": 351.7410888671875, "completions/mean_terminated_length": 351.7410888671875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.6064981949458483, "grad_norm": 0.27438926696777344, "kl": 0.0195770263671875, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 63149099.0, "reward": 1.180803656578064, "reward_std": 0.2631767690181732, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.40163227915763855, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2189.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 339.8883972167969, "completions/mean_terminated_length": 339.8883972167969, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.6106240330067045, "grad_norm": 0.30993056297302246, "kl": 0.02044677734375, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 63572969.0, "reward": 1.1674107313156128, "reward_std": 0.2310422956943512, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, "rewards/curriculum_aware_reward_fn/std": 0.387128084897995, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1523.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 344.9375305175781, "completions/mean_terminated_length": 344.9375305175781, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.6147498710675606, "grad_norm": 0.2932162582874298, "kl": 0.019500732421875, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 64015936.0, "reward": 1.2142857313156128, "reward_std": 0.2733439803123474, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.2276785671710968, "rewards/curriculum_aware_reward_fn/std": 0.41980284452438354, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 373.8169860839844, "completions/mean_terminated_length": 373.8169860839844, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.6188757091284167, "grad_norm": 0.3013036251068115, "kl": 0.023681640625, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 64469097.0, "reward": 1.171875, "reward_std": 0.2643316686153412, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.3960230350494385, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1098.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 336.671875, "completions/mean_terminated_length": 336.671875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.6230015471892728, "grad_norm": 0.26782163977622986, "kl": 0.02001953125, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 64883014.0, "reward": 1.1696429252624512, "reward_std": 0.24146796762943268, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, "rewards/curriculum_aware_reward_fn/std": 0.3834212124347687, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1448.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 339.7589416503906, "completions/mean_terminated_length": 339.7589416503906, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.6271273852501289, "grad_norm": 0.27384746074676514, "kl": 0.019866943359375, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 65292168.0, "reward": 1.1629464626312256, "reward_std": 0.20063751935958862, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1763392835855484, "rewards/curriculum_aware_reward_fn/std": 0.3815346360206604, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 337.1183166503906, "completions/mean_terminated_length": 337.1183166503906, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.631253223310985, "grad_norm": 0.3167986273765564, "kl": 0.01904296875, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 65717534.0, "reward": 1.2321430444717407, "reward_std": 0.31447455286979675, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.2477678507566452, "rewards/curriculum_aware_reward_fn/std": 0.4321989119052887, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 360.9263610839844, "completions/mean_terminated_length": 360.9263610839844, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.6353790613718412, "grad_norm": 0.24413932859897614, "kl": 0.0199432373046875, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 66149170.0, "reward": 1.1428571939468384, "reward_std": 0.17920571565628052, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1496.0, "completions/max_terminated_length": 1496.0, "completions/mean_length": 383.3995666503906, "completions/mean_terminated_length": 383.3995666503906, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.6395048994326973, "grad_norm": 0.2513328790664673, "kl": 0.0194244384765625, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 66603516.0, "reward": 1.1941964626312256, "reward_std": 0.22007882595062256, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.203125, "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1104.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 332.9977722167969, "completions/mean_terminated_length": 332.9977722167969, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.6436307374935534, "grad_norm": 0.25629723072052, "kl": 0.020050048828125, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 67028135.0, "reward": 1.2254464626312256, "reward_std": 0.21020323038101196, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2276785671710968, "rewards/curriculum_aware_reward_fn/std": 0.41980284452438354, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1470.0, "completions/max_terminated_length": 1470.0, "completions/mean_length": 333.421875, "completions/mean_terminated_length": 333.421875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.6477565755544095, "grad_norm": 0.23201420903205872, "kl": 0.0207977294921875, "learning_rate": 1e-06, "loss": 0.0165, "num_tokens": 67455121.0, "reward": 1.171875, "reward_std": 0.19378496706485748, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, "rewards/curriculum_aware_reward_fn/std": 0.39491814374923706, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 353.9821472167969, "completions/mean_terminated_length": 353.9821472167969, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.6518824136152656, "grad_norm": 0.3111206889152527, "kl": 0.01873779296875, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 67857928.0, "reward": 1.2410714626312256, "reward_std": 0.27191102504730225, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2455357164144516, "rewards/curriculum_aware_reward_fn/std": 0.43088552355766296, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 334.4598388671875, "completions/mean_terminated_length": 334.4598388671875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.6560082516761218, "grad_norm": 0.2842690646648407, "kl": 0.019073486328125, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 68288179.0, "reward": 1.212053656578064, "reward_std": 0.23378773033618927, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.2209821492433548, "rewards/curriculum_aware_reward_fn/std": 0.4153723120689392, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 350.80804443359375, "completions/mean_terminated_length": 350.80804443359375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.6601340897369778, "grad_norm": 0.2639565169811249, "kl": 0.02203369140625, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 68745207.0, "reward": 1.1294643878936768, "reward_std": 0.20632404088974, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.140625, "rewards/curriculum_aware_reward_fn/std": 0.3480229377746582, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 337.3839416503906, "completions/mean_terminated_length": 337.3839416503906, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6642599277978339, "grad_norm": 0.2646699547767639, "kl": 0.02032470703125, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 69168991.0, "reward": 1.2209821939468384, "reward_std": 0.2513192594051361, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.4221988022327423, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 338.0513610839844, "completions/mean_terminated_length": 329.644287109375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.66838576585869, "grad_norm": 0.25248733162879944, "kl": 0.0230255126953125, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 69589329.0, "reward": 1.1629464626312256, "reward_std": 0.19969065487384796, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.171875, "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1299.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 333.8415222167969, "completions/mean_terminated_length": 333.8415222167969, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.6725116039195461, "grad_norm": 0.2794862985610962, "kl": 0.02447509765625, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 70005730.0, "reward": 1.21875, "reward_std": 0.25172531604766846, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 318.1629638671875, "completions/mean_terminated_length": 318.1629638671875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.6766374419804023, "grad_norm": 0.26063379645347595, "kl": 0.0210113525390625, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 70418866.0, "reward": 1.1785714626312256, "reward_std": 0.19073154032230377, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1852678507566452, "rewards/curriculum_aware_reward_fn/std": 0.38894903659820557, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 326.5915222167969, "completions/mean_terminated_length": 326.5915222167969, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.6807632800412584, "grad_norm": 0.30665120482444763, "kl": 0.020233154296875, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 70844892.0, "reward": 1.2098214626312256, "reward_std": 0.2596711814403534, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2120535671710968, "rewards/curriculum_aware_reward_fn/std": 0.42001092433929443, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 326.13616943359375, "completions/mean_terminated_length": 326.13616943359375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.6848891181021145, "grad_norm": 0.2908986508846283, "kl": 0.0253753662109375, "learning_rate": 1e-06, "loss": 0.0201, "num_tokens": 71276216.0, "reward": 1.2008929252624512, "reward_std": 0.23872500658035278, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2075892835855484, "rewards/curriculum_aware_reward_fn/std": 0.4060344398021698, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 306.9620666503906, "completions/mean_terminated_length": 306.9620666503906, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.6890149561629706, "grad_norm": 0.291533887386322, "kl": 0.0217437744140625, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 71680113.0, "reward": 1.2232143878936768, "reward_std": 0.2559709846973419, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2276785671710968, "rewards/curriculum_aware_reward_fn/std": 0.41980281472206116, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 348.7031555175781, "completions/mean_terminated_length": 348.7031555175781, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.6931407942238267, "grad_norm": 0.24353955686092377, "kl": 0.02020263671875, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 72127000.0, "reward": 1.15625, "reward_std": 0.19178088009357452, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.15625, "rewards/curriculum_aware_reward_fn/std": 0.36349809169769287, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 324.6473388671875, "completions/mean_terminated_length": 324.6473388671875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.6972666322846828, "grad_norm": 0.3178284764289856, "kl": 0.0223236083984375, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 72557895.0, "reward": 1.2209821939468384, "reward_std": 0.28647440671920776, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 334.3526916503906, "completions/mean_terminated_length": 334.3526916503906, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.701392470345539, "grad_norm": 0.26770561933517456, "kl": 0.0296630859375, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 72999879.0, "reward": 1.2008929252624512, "reward_std": 0.22885264456272125, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.2075892835855484, "rewards/curriculum_aware_reward_fn/std": 0.4060344398021698, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 340.5535888671875, "completions/mean_terminated_length": 340.5535888671875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.7055183084063951, "grad_norm": 0.24942228198051453, "kl": 0.022125244140625, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 73426266.0, "reward": 1.2008929252624512, "reward_std": 0.20286457240581512, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.203125, "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 319.29241943359375, "completions/mean_terminated_length": 319.29241943359375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7096441464672512, "grad_norm": 0.24910351634025574, "kl": 0.0234375, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 73829751.0, "reward": 1.2388393878936768, "reward_std": 0.1940256804227829, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2410714328289032, "rewards/curriculum_aware_reward_fn/std": 0.4282117187976837, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 312.25, "completions/mean_terminated_length": 312.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7137699845281072, "grad_norm": 0.2692999839782715, "kl": 0.0258026123046875, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 74226560.0, "reward": 1.2723214626312256, "reward_std": 0.24515041708946228, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2767857015132904, "rewards/curriculum_aware_reward_fn/std": 0.44790977239608765, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2291.0, "completions/max_terminated_length": 2291.0, "completions/mean_length": 309.7901916503906, "completions/mean_terminated_length": 309.7901916503906, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.7178958225889633, "grad_norm": 0.30629029870033264, "kl": 0.026947021484375, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 74631913.0, "reward": 1.2321429252624512, "reward_std": 0.27668297290802, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2366071492433548, "rewards/curriculum_aware_reward_fn/std": 0.4254741966724396, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 311.7165222167969, "completions/mean_terminated_length": 311.7165222167969, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.7220216606498195, "grad_norm": 0.26726552844047546, "kl": 0.0240936279296875, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 75070126.0, "reward": 1.203125, "reward_std": 0.18516990542411804, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2053571492433548, "rewards/curriculum_aware_reward_fn/std": 0.40441393852233887, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 317.73663330078125, "completions/mean_terminated_length": 317.73663330078125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.7261474987106756, "grad_norm": 0.2893316149711609, "kl": 0.0234375, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 75481349.0, "reward": 1.2299107313156128, "reward_std": 0.2687956392765045, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2321428507566452, "rewards/curriculum_aware_reward_fn/std": 0.4226716458797455, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 314.5, "completions/mean_terminated_length": 314.5, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.7302733367715317, "grad_norm": 0.24689827859401703, "kl": 0.021942138671875, "learning_rate": 1e-06, "loss": -0.0046, "num_tokens": 75890625.0, "reward": 1.2075893878936768, "reward_std": 0.1872720867395401, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2142857164144516, "rewards/curriculum_aware_reward_fn/std": 0.41078460216522217, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 300.265625, "completions/mean_terminated_length": 300.265625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.7343991748323878, "grad_norm": 0.29448026418685913, "kl": 0.0265350341796875, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 76305536.0, "reward": 1.1986607313156128, "reward_std": 0.23206262290477753, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.203125, "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 305.0, "completions/mean_terminated_length": 305.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.7385250128932439, "grad_norm": 0.302929550409317, "kl": 0.0273590087890625, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 76691257.0, "reward": 1.2566964626312256, "reward_std": 0.26373592019081116, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.2633928656578064, "rewards/curriculum_aware_reward_fn/std": 0.44096609950065613, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 339.8348388671875, "completions/mean_terminated_length": 339.8348388671875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.7426508509541001, "grad_norm": 0.277395099401474, "kl": 0.0259552001953125, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 77103189.0, "reward": 1.1830357313156128, "reward_std": 0.2289111316204071, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, "rewards/curriculum_aware_reward_fn/std": 0.39252743124961853, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 339.46875, "completions/mean_terminated_length": 339.46875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.7467766890149562, "grad_norm": 0.31066209077835083, "kl": 0.0268096923828125, "learning_rate": 1e-06, "loss": 0.0076, "num_tokens": 77536825.0, "reward": 1.2299107313156128, "reward_std": 0.29481977224349976, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2410714328289032, "rewards/curriculum_aware_reward_fn/std": 0.42821168899536133, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 339.0625, "completions/mean_terminated_length": 339.0625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.7509025270758123, "grad_norm": 0.2716295123100281, "kl": 0.0228118896484375, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 77943889.0, "reward": 1.212053656578064, "reward_std": 0.23437470197677612, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.21875, "rewards/curriculum_aware_reward_fn/std": 0.4138607978820801, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 312.56475830078125, "completions/mean_terminated_length": 312.56475830078125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.7550283651366684, "grad_norm": 0.24832464754581451, "kl": 0.022735595703125, "learning_rate": 1e-06, "loss": -0.008, "num_tokens": 78343345.0, "reward": 1.1763393878936768, "reward_std": 0.19592319428920746, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.1763392835855484, "rewards/curriculum_aware_reward_fn/std": 0.3815346360206604, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1319.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 329.2321472167969, "completions/mean_terminated_length": 329.2321472167969, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.7591542031975245, "grad_norm": 0.26172786951065063, "kl": 0.0237579345703125, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 78775524.0, "reward": 1.1763393878936768, "reward_std": 0.2069976031780243, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, "rewards/curriculum_aware_reward_fn/std": 0.387128084897995, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1513.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 330.6852722167969, "completions/mean_terminated_length": 330.6852722167969, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.7632800412583806, "grad_norm": 0.31728121638298035, "kl": 0.025299072265625, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 79193500.0, "reward": 1.2299107313156128, "reward_std": 0.29063916206359863, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349845170975, "rewards/curriculum_aware_reward_fn/mean": 0.2388392835855484, "rewards/curriculum_aware_reward_fn/std": 0.4268510341644287, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1064.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 306.546875, "completions/mean_terminated_length": 306.546875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7674058793192368, "grad_norm": 0.2904778718948364, "kl": 0.0263671875, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 79590855.0, "reward": 1.3258929252624512, "reward_std": 0.26505059003829956, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.3325892984867096, "rewards/curriculum_aware_reward_fn/std": 0.4948147237300873, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 327.28350830078125, "completions/mean_terminated_length": 327.28350830078125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.7715317173800929, "grad_norm": 0.25731411576271057, "kl": 0.022735595703125, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 80005039.0, "reward": 1.15625, "reward_std": 0.21958614885807037, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1516.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 327.99554443359375, "completions/mean_terminated_length": 327.99554443359375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.7756575554409489, "grad_norm": 0.2938048243522644, "kl": 0.024993896484375, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 80414810.0, "reward": 1.28125, "reward_std": 0.27645939588546753, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2879464328289032, "rewards/curriculum_aware_reward_fn/std": 0.4533121883869171, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 332.0870666503906, "completions/mean_terminated_length": 332.0870666503906, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.779783393501805, "grad_norm": 0.29818570613861084, "kl": 0.02227783203125, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 80821675.0, "reward": 1.2522321939468384, "reward_std": 0.26822447776794434, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2589285671710968, "rewards/curriculum_aware_reward_fn/std": 0.43853601813316345, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 311.3817138671875, "completions/mean_terminated_length": 311.3817138671875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.7839092315626611, "grad_norm": 0.2672550678253174, "kl": 0.02789306640625, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 81213909.0, "reward": 1.2075893878936768, "reward_std": 0.23625390231609344, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.42746472358703613, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 323.859375, "completions/mean_terminated_length": 323.859375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.7880350696235173, "grad_norm": 0.28063666820526123, "kl": 0.0228729248046875, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 81608128.0, "reward": 1.296875, "reward_std": 0.2625458836555481, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.3035714328289032, "rewards/curriculum_aware_reward_fn/std": 0.46031373739242554, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1101.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 339.80804443359375, "completions/mean_terminated_length": 339.80804443359375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.7921609076843734, "grad_norm": 0.2821204662322998, "kl": 0.0235748291015625, "learning_rate": 1e-06, "loss": 0.0263, "num_tokens": 82053375.0, "reward": 1.171875, "reward_std": 0.23261132836341858, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1808035671710968, "rewards/curriculum_aware_reward_fn/std": 0.3852855861186981, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 319.71875, "completions/mean_terminated_length": 319.71875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.7962867457452295, "grad_norm": 0.29887884855270386, "kl": 0.0264129638671875, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 82469994.0, "reward": 1.2455357313156128, "reward_std": 0.22913821041584015, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.25, "rewards/curriculum_aware_reward_fn/std": 0.43349677324295044, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 317.40850830078125, "completions/mean_terminated_length": 317.40850830078125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8004125838060856, "grad_norm": 0.29472821950912476, "kl": 0.0242919921875, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 82876583.0, "reward": 1.2678571939468384, "reward_std": 0.26547685265541077, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2700892984867096, "rewards/curriculum_aware_reward_fn/std": 0.444502055644989, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1429.0, "completions/max_terminated_length": 1429.0, "completions/mean_length": 328.8794860839844, "completions/mean_terminated_length": 328.8794860839844, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.8045384218669417, "grad_norm": 0.2741622030735016, "kl": 0.0247344970703125, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 83282704.0, "reward": 1.2075893878936768, "reward_std": 0.2490561157464981, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452, "rewards/curriculum_aware_reward_fn/std": 0.41233164072036743, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 325.9442138671875, "completions/mean_terminated_length": 325.9442138671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.8086642599277978, "grad_norm": 0.28116053342819214, "kl": 0.024322509765625, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 83700215.0, "reward": 1.1763393878936768, "reward_std": 0.23797489702701569, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, "rewards/curriculum_aware_reward_fn/std": 0.387128084897995, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 333.875, "completions/mean_terminated_length": 333.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.812790097988654, "grad_norm": 0.2228228598833084, "kl": 0.0242156982421875, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 84116820.0, "reward": 1.1852679252624512, "reward_std": 0.2046811729669571, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1875, "rewards/curriculum_aware_reward_fn/std": 0.4075627326965332, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 315.8571472167969, "completions/mean_terminated_length": 315.8571472167969, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.8169159360495101, "grad_norm": 0.26287922263145447, "kl": 0.0230560302734375, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 84512193.0, "reward": 1.2477679252624512, "reward_std": 0.21247047185897827, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.2477678507566452, "rewards/curriculum_aware_reward_fn/std": 0.4321989119052887, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 305.0401916503906, "completions/mean_terminated_length": 296.5592956542969, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.8210417741103662, "grad_norm": 0.3131493330001831, "kl": 0.02862548828125, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 84907991.0, "reward": 1.2566964626312256, "reward_std": 0.2617596387863159, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.2633928656578064, "rewards/curriculum_aware_reward_fn/std": 0.44096609950065613, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 323.9508972167969, "completions/mean_terminated_length": 315.5122985839844, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.8251676121712223, "grad_norm": 0.3015516400337219, "kl": 0.0243072509765625, "learning_rate": 1e-06, "loss": 0.0317, "num_tokens": 85335081.0, "reward": 1.1897321939468384, "reward_std": 0.2864902913570404, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484, "rewards/curriculum_aware_reward_fn/std": 0.4054744839668274, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 315.984375, "completions/mean_terminated_length": 315.984375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.8292934502320783, "grad_norm": 0.24153034389019012, "kl": 0.0271759033203125, "learning_rate": 1e-06, "loss": 0.0076, "num_tokens": 85745344.0, "reward": 1.1517857313156128, "reward_std": 0.17364409565925598, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 310.6785888671875, "completions/mean_terminated_length": 310.6785888671875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.8334192882929345, "grad_norm": 0.26400184631347656, "kl": 0.0273895263671875, "learning_rate": 1e-06, "loss": 0.0271, "num_tokens": 86143745.0, "reward": 1.165178656578064, "reward_std": 0.2323482185602188, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452, "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1204.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 323.82366943359375, "completions/mean_terminated_length": 323.82366943359375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.8375451263537906, "grad_norm": 0.2793115973472595, "kl": 0.02593994140625, "learning_rate": 1e-06, "loss": -0.0108, "num_tokens": 86570820.0, "reward": 1.133928656578064, "reward_std": 0.18545547127723694, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.1339285671710968, "rewards/curriculum_aware_reward_fn/std": 0.34095630049705505, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1691.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 306.9196472167969, "completions/mean_terminated_length": 306.9196472167969, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.8416709644146467, "grad_norm": 0.28388360142707825, "kl": 0.026458740234375, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 86973151.0, "reward": 1.2232143878936768, "reward_std": 0.22128692269325256, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2276785671710968, "rewards/curriculum_aware_reward_fn/std": 0.41980284452438354, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 299.24554443359375, "completions/mean_terminated_length": 299.24554443359375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.8457968024755028, "grad_norm": 0.31298768520355225, "kl": 0.0265960693359375, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 87371995.0, "reward": 1.2633929252624512, "reward_std": 0.28636130690574646, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.265625, "rewards/curriculum_aware_reward_fn/std": 0.4570859372615814, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2246.0, "completions/max_terminated_length": 2246.0, "completions/mean_length": 316.68975830078125, "completions/mean_terminated_length": 316.68975830078125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.8499226405363589, "grad_norm": 0.25633522868156433, "kl": 0.0277252197265625, "learning_rate": 1e-06, "loss": 0.0287, "num_tokens": 87774986.0, "reward": 1.2142857313156128, "reward_std": 0.23026974499225616, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2254464328289032, "rewards/curriculum_aware_reward_fn/std": 0.41834309697151184, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 302.9576110839844, "completions/mean_terminated_length": 302.9576110839844, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8540484785972151, "grad_norm": 0.2657616138458252, "kl": 0.03033447265625, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 88167429.0, "reward": 1.2678571939468384, "reward_std": 0.2246430367231369, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2700892984867096, "rewards/curriculum_aware_reward_fn/std": 0.444502055644989, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3090.0, "completions/max_terminated_length": 3090.0, "completions/mean_length": 321.8214416503906, "completions/mean_terminated_length": 321.8214416503906, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.8581743166580712, "grad_norm": 0.2790772616863251, "kl": 0.025634765625, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 88572817.0, "reward": 1.2410715818405151, "reward_std": 0.2528951168060303, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.25, "rewards/curriculum_aware_reward_fn/std": 0.43349677324295044, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 311.828125, "completions/mean_terminated_length": 311.828125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8623001547189273, "grad_norm": 0.2750173509120941, "kl": 0.02984619140625, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 88991396.0, "reward": 1.21875, "reward_std": 0.23900403082370758, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 312.26788330078125, "completions/mean_terminated_length": 312.26788330078125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.8664259927797834, "grad_norm": 0.30650582909584045, "kl": 0.029510498046875, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 89397985.0, "reward": 1.2366071939468384, "reward_std": 0.26114898920059204, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2388392835855484, "rewards/curriculum_aware_reward_fn/std": 0.4372074007987976, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 298.5535888671875, "completions/mean_terminated_length": 298.5535888671875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.8705518308406395, "grad_norm": 0.31274229288101196, "kl": 0.0353546142578125, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 89786467.0, "reward": 1.2232143878936768, "reward_std": 0.29923391342163086, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.2410714328289032, "rewards/curriculum_aware_reward_fn/std": 0.43853598833084106, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 315.79913330078125, "completions/mean_terminated_length": 315.79913330078125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.8746776689014956, "grad_norm": 0.256560355424881, "kl": 0.02703857421875, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 90191460.0, "reward": 1.2299107313156128, "reward_std": 0.1991419643163681, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2321428507566452, "rewards/curriculum_aware_reward_fn/std": 0.4226716458797455, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 326.89288330078125, "completions/mean_terminated_length": 326.89288330078125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.8788035069623518, "grad_norm": 0.2637675106525421, "kl": 0.0247344970703125, "learning_rate": 1e-06, "loss": 0.0246, "num_tokens": 90609809.0, "reward": 1.2209821939468384, "reward_std": 0.23653537034988403, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 336.1540222167969, "completions/mean_terminated_length": 336.1540222167969, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.8829293450232079, "grad_norm": 0.30552083253860474, "kl": 0.02789306640625, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 91040511.0, "reward": 1.2566964626312256, "reward_std": 0.3050508499145508, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2633928656578064, "rewards/curriculum_aware_reward_fn/std": 0.44096609950065613, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 319.6629638671875, "completions/mean_terminated_length": 319.6629638671875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.887055183084064, "grad_norm": 0.24829484522342682, "kl": 0.0282745361328125, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 91442902.0, "reward": 1.2924107313156128, "reward_std": 0.17238543927669525, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2946428656578064, "rewards/curriculum_aware_reward_fn/std": 0.45639169216156006, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 322.52679443359375, "completions/mean_terminated_length": 322.52679443359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.89118102114492, "grad_norm": 0.27576252818107605, "kl": 0.0244140625, "learning_rate": 1e-06, "loss": 0.0226, "num_tokens": 91856558.0, "reward": 1.25, "reward_std": 0.23481683433055878, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.25, "rewards/curriculum_aware_reward_fn/std": 0.44369807839393616, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1190.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 311.1808166503906, "completions/mean_terminated_length": 311.1808166503906, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.8953068592057761, "grad_norm": 0.2690725326538086, "kl": 0.0276031494140625, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 92258089.0, "reward": 1.2075893878936768, "reward_std": 0.2252287119626999, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2142857164144516, "rewards/curriculum_aware_reward_fn/std": 0.4107845723628998, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 299.5692138671875, "completions/mean_terminated_length": 299.5692138671875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.8994326972666323, "grad_norm": 0.25958672165870667, "kl": 0.0254058837890625, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 92649693.0, "reward": 1.2254464626312256, "reward_std": 0.20484624803066254, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2276785671710968, "rewards/curriculum_aware_reward_fn/std": 0.4250984489917755, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 308.4419860839844, "completions/mean_terminated_length": 308.4419860839844, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.9035585353274884, "grad_norm": 0.24980401992797852, "kl": 0.02593994140625, "learning_rate": 1e-06, "loss": -0.008, "num_tokens": 93058222.0, "reward": 1.2232143878936768, "reward_std": 0.19477578997612, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2254464328289032, "rewards/curriculum_aware_reward_fn/std": 0.41834309697151184, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1255.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 315.14288330078125, "completions/mean_terminated_length": 315.14288330078125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.9076843733883445, "grad_norm": 0.25699254870414734, "kl": 0.0230255126953125, "learning_rate": 1e-06, "loss": 0.0155, "num_tokens": 93454059.0, "reward": 1.2410714626312256, "reward_std": 0.1934994012117386, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2433035671710968, "rewards/curriculum_aware_reward_fn/std": 0.4295565187931061, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1090.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 305.890625, "completions/mean_terminated_length": 305.890625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.9118102114492006, "grad_norm": 0.2736288011074066, "kl": 0.0252532958984375, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 93842818.0, "reward": 1.2477679252624512, "reward_std": 0.22172905504703522, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.25, "rewards/curriculum_aware_reward_fn/std": 0.43349677324295044, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 321.24554443359375, "completions/mean_terminated_length": 321.24554443359375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.9159360495100567, "grad_norm": 0.266389936208725, "kl": 0.02783203125, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 94250216.0, "reward": 1.1941964626312256, "reward_std": 0.23208504915237427, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2008928507566452, "rewards/curriculum_aware_reward_fn/std": 0.4011160135269165, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 345.79241943359375, "completions/mean_terminated_length": 345.79241943359375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.9200618875709129, "grad_norm": 0.24590951204299927, "kl": 0.025299072265625, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 94670326.0, "reward": 1.2388393878936768, "reward_std": 0.23360216617584229, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2410714328289032, "rewards/curriculum_aware_reward_fn/std": 0.4486227333545685, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 352.33929443359375, "completions/mean_terminated_length": 352.33929443359375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.924187725631769, "grad_norm": 0.2685246169567108, "kl": 0.025054931640625, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 95106060.0, "reward": 1.2477679252624512, "reward_std": 0.24343277513980865, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2544642984867096, "rewards/curriculum_aware_reward_fn/std": 0.4360465705394745, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1119.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 333.7567138671875, "completions/mean_terminated_length": 333.7567138671875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.9283135636926251, "grad_norm": 0.27416351437568665, "kl": 0.025299072265625, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 95524201.0, "reward": 1.2142857313156128, "reward_std": 0.23922230303287506, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452, "rewards/curriculum_aware_reward_fn/std": 0.41233164072036743, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 330.85491943359375, "completions/mean_terminated_length": 330.85491943359375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.9324394017534812, "grad_norm": 0.24719306826591492, "kl": 0.0243988037109375, "learning_rate": 1e-06, "loss": 0.0204, "num_tokens": 95936009.0, "reward": 1.227678656578064, "reward_std": 0.20834526419639587, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.234375, "rewards/curriculum_aware_reward_fn/std": 0.42408111691474915, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 336.1383972167969, "completions/mean_terminated_length": 327.7270812988281, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9365652398143373, "grad_norm": 0.2825274169445038, "kl": 0.0266876220703125, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 96349232.0, "reward": 1.21875, "reward_std": 0.23285290598869324, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.2276785671710968, "rewards/curriculum_aware_reward_fn/std": 0.41980284452438354, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 342.8995666503906, "completions/mean_terminated_length": 342.8995666503906, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.9406910778751933, "grad_norm": 0.2726384401321411, "kl": 0.02545166015625, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 96767934.0, "reward": 1.2477679252624512, "reward_std": 0.23435229063034058, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.2477678507566452, "rewards/curriculum_aware_reward_fn/std": 0.4321989119052887, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 353.0602722167969, "completions/mean_terminated_length": 353.0602722167969, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.9448169159360496, "grad_norm": 0.25261878967285156, "kl": 0.030792236328125, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 97207204.0, "reward": 1.227678656578064, "reward_std": 0.2187439203262329, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.2366071492433548, "rewards/curriculum_aware_reward_fn/std": 0.4254741966724396, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 332.21875, "completions/mean_terminated_length": 332.21875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.9489427539969056, "grad_norm": 0.2988578975200653, "kl": 0.02734375, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 97625440.0, "reward": 1.2723214626312256, "reward_std": 0.2795713245868683, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2767857015132904, "rewards/curriculum_aware_reward_fn/std": 0.44790980219841003, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 337.97100830078125, "completions/mean_terminated_length": 337.97100830078125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.9530685920577617, "grad_norm": 0.3103151023387909, "kl": 0.025634765625, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 98038483.0, "reward": 1.2410714626312256, "reward_std": 0.2781383693218231, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.25, "rewards/curriculum_aware_reward_fn/std": 0.43349677324295044, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 323.9888610839844, "completions/mean_terminated_length": 315.5503234863281, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.9571944301186178, "grad_norm": 0.3021485209465027, "kl": 0.025390625, "learning_rate": 1e-06, "loss": 0.0381, "num_tokens": 98450337.0, "reward": 1.2633929252624512, "reward_std": 0.27397364377975464, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2700892984867096, "rewards/curriculum_aware_reward_fn/std": 0.444502055644989, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1302.0, "completions/max_terminated_length": 1302.0, "completions/mean_length": 340.8839416503906, "completions/mean_terminated_length": 340.8839416503906, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.9613202681794739, "grad_norm": 0.27986449003219604, "kl": 0.02471923828125, "learning_rate": 1e-06, "loss": 0.0207, "num_tokens": 98870852.0, "reward": 1.21875, "reward_std": 0.2478822022676468, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.21875, "rewards/curriculum_aware_reward_fn/std": 0.4138607978820801, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1932.0, "completions/max_terminated_length": 1932.0, "completions/mean_length": 322.99554443359375, "completions/mean_terminated_length": 322.99554443359375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.9654461062403301, "grad_norm": 0.2800370752811432, "kl": 0.02777099609375, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 99287511.0, "reward": 1.1919643878936768, "reward_std": 0.20504765212535858, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 327.33929443359375, "completions/mean_terminated_length": 327.33929443359375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.9695719443011862, "grad_norm": 0.30059149861335754, "kl": 0.02850341796875, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 99705129.0, "reward": 1.305803656578064, "reward_std": 0.2940920889377594, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.3147321343421936, "rewards/curriculum_aware_reward_fn/std": 0.47445425391197205, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 312.1071472167969, "completions/mean_terminated_length": 312.1071472167969, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.9736977823620423, "grad_norm": 0.30200207233428955, "kl": 0.028228759765625, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 100119409.0, "reward": 1.2053571939468384, "reward_std": 0.2535247504711151, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.2120535671710968, "rewards/curriculum_aware_reward_fn/std": 0.40921953320503235, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 306.890625, "completions/mean_terminated_length": 306.890625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.9778236204228984, "grad_norm": 0.3258730471134186, "kl": 0.0380859375, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 100513180.0, "reward": 1.28125, "reward_std": 0.2918248772621155, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2857142984867096, "rewards/curriculum_aware_reward_fn/std": 0.45225897431373596, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 304.640625, "completions/mean_terminated_length": 304.640625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.9819494584837545, "grad_norm": 0.29615044593811035, "kl": 0.0279693603515625, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 100925613.0, "reward": 1.212053656578064, "reward_std": 0.2311783730983734, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452, "rewards/curriculum_aware_reward_fn/std": 0.4230436384677887, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 317.7901916503906, "completions/mean_terminated_length": 317.7901916503906, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.9860752965446106, "grad_norm": 0.28646448254585266, "kl": 0.029632568359375, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 101335236.0, "reward": 1.1852679252624512, "reward_std": 0.2169070988893509, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, "rewards/curriculum_aware_reward_fn/std": 0.3977404832839966, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 313.8058166503906, "completions/mean_terminated_length": 313.8058166503906, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.9902011346054668, "grad_norm": 0.2968890964984894, "kl": 0.030731201171875, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 101748576.0, "reward": 1.2566964626312256, "reward_std": 0.26950088143348694, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2589285671710968, "rewards/curriculum_aware_reward_fn/std": 0.43853598833084106, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 314.5357360839844, "completions/mean_terminated_length": 314.5357360839844, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.9943269726663229, "grad_norm": 0.278796523809433, "kl": 0.03369140625, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 102165778.0, "reward": 1.265625, "reward_std": 0.23338386416435242, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2700892984867096, "rewards/curriculum_aware_reward_fn/std": 0.444502055644989, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 304.0802917480469, "completions/mean_terminated_length": 304.0802917480469, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.998452810727179, "grad_norm": 0.28702470660209656, "kl": 0.0279998779296875, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 102575735.0, "reward": 1.2254464626312256, "reward_std": 0.22519510984420776, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.2321428507566452, "rewards/curriculum_aware_reward_fn/std": 0.4226716458797455, "step": 242 }, { "epoch": 0.998452810727179, "step": 242, "total_flos": 0.0, "train_loss": 0.01385925530842895, "train_runtime": 22451.8912, "train_samples_per_second": 0.691, "train_steps_per_second": 0.011 } ], "logging_steps": 1, "max_steps": 242, "num_input_tokens_seen": 102575735, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }