{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998452810727179, "eval_steps": 500, "global_step": 242, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2703.0, "completions/max_terminated_length": 2703.0, "completions/mean_length": 485.69866943359375, "completions/mean_terminated_length": 485.69866943359375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.0041258380608561115, "grad_norm": 0.22616177797317505, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 484412.0, "reward": 0.1071428656578064, "reward_std": 0.166747584939003, "rewards/code_format_reward/mean": 0.0446428582072258, "rewards/code_format_reward/std": 0.2067493349313736, "rewards/curriculum_aware_reward_fn/mean": 0.0625, "rewards/curriculum_aware_reward_fn/std": 0.24233205616474152, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 476.8906555175781, "completions/mean_terminated_length": 476.8906555175781, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.008251676121712223, "grad_norm": 0.24903689324855804, "kl": 0.00027942657470703125, "learning_rate": 1e-06, "loss": 0.0179, "num_tokens": 947399.0, "reward": 0.1071428656578064, "reward_std": 0.18021897971630096, "rewards/code_format_reward/mean": 0.0558035708963871, "rewards/code_format_reward/std": 0.22979861497879028, "rewards/curriculum_aware_reward_fn/mean": 0.0513392873108387, "rewards/curriculum_aware_reward_fn/std": 0.22093553841114044, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 500.8951110839844, "completions/mean_terminated_length": 476.6584167480469, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.012377514182568335, "grad_norm": 0.28932124376296997, "kl": 0.0003249645233154297, "learning_rate": 1e-06, "loss": 0.0449, "num_tokens": 1424199.0, "reward": 0.1406250149011612, "reward_std": 0.23391105234622955, "rewards/code_format_reward/mean": 0.0691964253783226, "rewards/code_format_reward/std": 0.25407159328460693, "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774, "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 508.4062805175781, "completions/mean_terminated_length": 500.38031005859375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.016503352243424446, "grad_norm": 0.3046747148036957, "kl": 0.00036406517028808594, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 1931986.0, "reward": 0.1160714328289032, "reward_std": 0.22505438327789307, "rewards/code_format_reward/mean": 0.0848214253783226, "rewards/code_format_reward/std": 0.2789272665977478, "rewards/curriculum_aware_reward_fn/mean": 0.03125, "rewards/curriculum_aware_reward_fn/std": 0.17418713867664337, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 530.466552734375, "completions/mean_terminated_length": 530.466552734375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.020629190304280558, "grad_norm": 0.37298938632011414, "kl": 0.0006275177001953125, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 2448578.0, "reward": 0.1986607164144516, "reward_std": 0.3610951602458954, "rewards/code_format_reward/mean": 0.1584821492433548, "rewards/code_format_reward/std": 0.36560073494911194, "rewards/curriculum_aware_reward_fn/mean": 0.0401785708963871, "rewards/curriculum_aware_reward_fn/std": 0.2076651155948639, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 454.3370666503906, "completions/mean_terminated_length": 454.3370666503906, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.02475502836513667, "grad_norm": 0.48693978786468506, "kl": 0.0013666152954101562, "learning_rate": 1e-06, "loss": 0.0372, "num_tokens": 2927424.0, "reward": 0.3236607611179352, "reward_std": 0.4571215510368347, "rewards/code_format_reward/mean": 0.2544642984867096, "rewards/code_format_reward/std": 0.4360465705394745, "rewards/curriculum_aware_reward_fn/mean": 0.0691964253783226, "rewards/curriculum_aware_reward_fn/std": 0.25407159328460693, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3109.0, "completions/max_terminated_length": 3109.0, "completions/mean_length": 476.78350830078125, "completions/mean_terminated_length": 476.78350830078125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.02888086642599278, "grad_norm": 0.4472070634365082, "kl": 0.0018758773803710938, "learning_rate": 1e-06, "loss": 0.032, "num_tokens": 3421765.0, "reward": 0.3325892984867096, "reward_std": 0.44534748792648315, "rewards/code_format_reward/mean": 0.2723214328289032, "rewards/code_format_reward/std": 0.4456520676612854, "rewards/curriculum_aware_reward_fn/mean": 0.0602678582072258, "rewards/curriculum_aware_reward_fn/std": 0.23824846744537354, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1396.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 446.5067138671875, "completions/mean_terminated_length": 446.5067138671875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.03300670448684889, "grad_norm": 0.4557957649230957, "kl": 0.002429962158203125, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 3885133.0, "reward": 0.4151786267757416, "reward_std": 0.4811772108078003, "rewards/code_format_reward/mean": 0.359375, "rewards/code_format_reward/std": 0.4803536534309387, "rewards/curriculum_aware_reward_fn/mean": 0.0558035708963871, "rewards/curriculum_aware_reward_fn/std": 0.22979861497879028, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3443.0, "completions/max_terminated_length": 3443.0, "completions/mean_length": 450.9375305175781, "completions/mean_terminated_length": 450.9375305175781, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.037132542547705004, "grad_norm": 0.44398829340934753, "kl": 0.0042476654052734375, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 4348652.0, "reward": 0.5468750596046448, "reward_std": 0.5476124286651611, "rewards/code_format_reward/mean": 0.4866071343421936, "rewards/code_format_reward/std": 0.5003793835639954, "rewards/curriculum_aware_reward_fn/mean": 0.0602678582072258, "rewards/curriculum_aware_reward_fn/std": 0.23824848234653473, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 422.4754638671875, "completions/mean_terminated_length": 422.4754638671875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.041258380608561115, "grad_norm": 0.4429943859577179, "kl": 0.006931304931640625, "learning_rate": 1e-06, "loss": 0.0169, "num_tokens": 4808366.0, "reward": 0.7008928656578064, "reward_std": 0.5152171850204468, "rewards/code_format_reward/mean": 0.6495535969734192, "rewards/code_format_reward/std": 0.47764313220977783, "rewards/curriculum_aware_reward_fn/mean": 0.0513392873108387, "rewards/curriculum_aware_reward_fn/std": 0.22093555331230164, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 405.6004638671875, "completions/mean_terminated_length": 405.6004638671875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.04538421866941723, "grad_norm": 0.4400160610675812, "kl": 0.005626678466796875, "learning_rate": 1e-06, "loss": -0.0079, "num_tokens": 5255123.0, "reward": 0.7812500596046448, "reward_std": 0.44405829906463623, "rewards/code_format_reward/mean": 0.734375, "rewards/code_format_reward/std": 0.44215917587280273, "rewards/curriculum_aware_reward_fn/mean": 0.046875, "rewards/curriculum_aware_reward_fn/std": 0.21160738170146942, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 427.71429443359375, "completions/mean_terminated_length": 427.71429443359375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.04951005673027334, "grad_norm": 0.41308656334877014, "kl": 0.0095062255859375, "learning_rate": 1e-06, "loss": -0.0089, "num_tokens": 5714678.0, "reward": 0.7879464626312256, "reward_std": 0.40129104256629944, "rewards/code_format_reward/mean": 0.7589285969734192, "rewards/code_format_reward/std": 0.4282117187976837, "rewards/curriculum_aware_reward_fn/mean": 0.02901785634458065, "rewards/curriculum_aware_reward_fn/std": 0.16804419457912445, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1237.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 409.65179443359375, "completions/mean_terminated_length": 409.65179443359375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.05363589479112945, "grad_norm": 0.3630838990211487, "kl": 0.007907867431640625, "learning_rate": 1e-06, "loss": -0.0151, "num_tokens": 6166701.0, "reward": 0.8616071939468384, "reward_std": 0.3657563328742981, "rewards/code_format_reward/mean": 0.796875, "rewards/code_format_reward/std": 0.4027745723724365, "rewards/curriculum_aware_reward_fn/mean": 0.0647321417927742, "rewards/curriculum_aware_reward_fn/std": 0.24632768332958221, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2308.0, "completions/max_terminated_length": 2308.0, "completions/mean_length": 406.79913330078125, "completions/mean_terminated_length": 406.79913330078125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.05776173285198556, "grad_norm": 0.4150392711162567, "kl": 0.010379791259765625, "learning_rate": 1e-06, "loss": 0.0252, "num_tokens": 6634050.0, "reward": 0.935267984867096, "reward_std": 0.4047480821609497, "rewards/code_format_reward/mean": 0.828125, "rewards/code_format_reward/std": 0.3776935040950775, "rewards/curriculum_aware_reward_fn/mean": 0.1071428582072258, "rewards/curriculum_aware_reward_fn/std": 0.3096405565738678, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1917.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 410.2745666503906, "completions/mean_terminated_length": 410.2745666503906, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.06188757091284167, "grad_norm": 0.37965965270996094, "kl": 0.00659942626953125, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 7072429.0, "reward": 1.03125, "reward_std": 0.32808175683021545, "rewards/code_format_reward/mean": 0.9040178656578064, "rewards/code_format_reward/std": 0.29489606618881226, "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1777.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 426.73663330078125, "completions/mean_terminated_length": 426.73663330078125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.06601340897369778, "grad_norm": 0.3211141526699066, "kl": 0.009593963623046875, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 7548199.0, "reward": 0.9419643878936768, "reward_std": 0.3219388723373413, "rewards/code_format_reward/mean": 0.875, "rewards/code_format_reward/std": 0.3310886323451996, "rewards/curriculum_aware_reward_fn/mean": 0.0669642835855484, "rewards/curriculum_aware_reward_fn/std": 0.25023961067199707, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1682.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 447.3839416503906, "completions/mean_terminated_length": 447.3839416503906, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.07013924703455389, "grad_norm": 0.2893203794956207, "kl": 0.01567840576171875, "learning_rate": 1e-06, "loss": 0.0274, "num_tokens": 8007690.0, "reward": 1.0, "reward_std": 0.21567682921886444, "rewards/code_format_reward/mean": 0.9419642686843872, "rewards/code_format_reward/std": 0.23407234251499176, "rewards/curriculum_aware_reward_fn/mean": 0.0580357126891613, "rewards/curriculum_aware_reward_fn/std": 0.23407234251499176, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 418.9821472167969, "completions/mean_terminated_length": 418.9821472167969, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.07426508509541001, "grad_norm": 0.32584109902381897, "kl": 0.027439117431640625, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 8464775.0, "reward": 1.03125, "reward_std": 0.1857805997133255, "rewards/code_format_reward/mean": 0.9598214030265808, "rewards/code_format_reward/std": 0.1965973675251007, "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774, "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 431.77679443359375, "completions/mean_terminated_length": 431.77679443359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.07839092315626611, "grad_norm": 0.276391863822937, "kl": 0.02285003662109375, "learning_rate": 1e-06, "loss": 0.0142, "num_tokens": 8948690.0, "reward": 0.9977679252624512, "reward_std": 0.16488999128341675, "rewards/code_format_reward/mean": 0.953125, "rewards/code_format_reward/std": 0.21160738170146942, "rewards/curriculum_aware_reward_fn/mean": 0.0446428582072258, "rewards/curriculum_aware_reward_fn/std": 0.2067493200302124, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 459.4308166503906, "completions/mean_terminated_length": 459.4308166503906, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.08251676121712223, "grad_norm": 0.20041462779045105, "kl": 0.0071868896484375, "learning_rate": 1e-06, "loss": 0.0152, "num_tokens": 9423700.0, "reward": 1.03125, "reward_std": 0.14859183132648468, "rewards/code_format_reward/mean": 0.96875, "rewards/code_format_reward/std": 0.17418713867664337, "rewards/curriculum_aware_reward_fn/mean": 0.0625, "rewards/curriculum_aware_reward_fn/std": 0.24233205616474152, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1143.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 430.8437805175781, "completions/mean_terminated_length": 430.8437805175781, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.08664259927797834, "grad_norm": 0.17111293971538544, "kl": 0.007709503173828125, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 9877699.0, "reward": 1.0357143878936768, "reward_std": 0.10029345005750656, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0446428582072258, "rewards/curriculum_aware_reward_fn/std": 0.2067493349313736, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 432.40850830078125, "completions/mean_terminated_length": 432.40850830078125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.09076843733883445, "grad_norm": 0.17106418311595917, "kl": 0.008121490478515625, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 10328549.0, "reward": 1.0446429252624512, "reward_std": 0.12917643785476685, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.0602678582072258, "rewards/curriculum_aware_reward_fn/std": 0.23824846744537354, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2522.0, "completions/max_terminated_length": 2522.0, "completions/mean_length": 423.3817138671875, "completions/mean_terminated_length": 423.3817138671875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.09489427539969056, "grad_norm": 0.19619248807430267, "kl": 0.013530731201171875, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 10780773.0, "reward": 1.0200893878936768, "reward_std": 0.10209290683269501, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.0401785708963871, "rewards/curriculum_aware_reward_fn/std": 0.1965973675251007, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1691.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 432.2388610839844, "completions/mean_terminated_length": 432.2388610839844, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.09902011346054668, "grad_norm": 0.1739608198404312, "kl": 0.008785247802734375, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 11246681.0, "reward": 1.0424107313156128, "reward_std": 0.1010638028383255, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0513392873108387, "rewards/curriculum_aware_reward_fn/std": 0.22093555331230164, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 419.3035888671875, "completions/mean_terminated_length": 419.3035888671875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.10314595152140278, "grad_norm": 0.21773949265480042, "kl": 0.0096435546875, "learning_rate": 1e-06, "loss": 0.0147, "num_tokens": 11704946.0, "reward": 1.0647321939468384, "reward_std": 0.14116087555885315, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.0758928582072258, "rewards/curriculum_aware_reward_fn/std": 0.265122652053833, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1523.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 443.53350830078125, "completions/mean_terminated_length": 443.53350830078125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.1072717895822589, "grad_norm": 0.170355424284935, "kl": 0.00921630859375, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 12180968.0, "reward": 1.0647321939468384, "reward_std": 0.1207783967256546, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0736607164144516, "rewards/curriculum_aware_reward_fn/std": 0.2615099549293518, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1110.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 409.91741943359375, "completions/mean_terminated_length": 409.91741943359375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.111397627643115, "grad_norm": 0.19676493108272552, "kl": 0.009613037109375, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 12640131.0, "reward": 1.0691965818405151, "reward_std": 0.13159896433353424, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.0803571417927742, "rewards/curriculum_aware_reward_fn/std": 0.2721492052078247, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 409.0089416503906, "completions/mean_terminated_length": 409.0089416503906, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.11552346570397112, "grad_norm": 0.2110663503408432, "kl": 0.00991058349609375, "learning_rate": 1e-06, "loss": 0.0168, "num_tokens": 13094575.0, "reward": 1.0848214626312256, "reward_std": 0.1635512113571167, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.09375, "rewards/curriculum_aware_reward_fn/std": 0.2918064594268799, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1174.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 406.5558166503906, "completions/mean_terminated_length": 406.5558166503906, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.11964930376482723, "grad_norm": 0.2409823089838028, "kl": 0.010772705078125, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 13553702.0, "reward": 1.055803656578064, "reward_std": 0.1305857002735138, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0647321417927742, "rewards/curriculum_aware_reward_fn/std": 0.24632768332958221, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 410.54241943359375, "completions/mean_terminated_length": 410.54241943359375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.12377514182568335, "grad_norm": 0.19062326848506927, "kl": 0.01104736328125, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 14000993.0, "reward": 1.0736607313156128, "reward_std": 0.12787634134292603, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.0803571417927742, "rewards/curriculum_aware_reward_fn/std": 0.2721492052078247, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1765.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 404.90179443359375, "completions/mean_terminated_length": 404.90179443359375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.12790097988653945, "grad_norm": 0.15294596552848816, "kl": 0.01055908203125, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 14464321.0, "reward": 1.1116071939468384, "reward_std": 0.1003519669175148, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.1116071417927742, "rewards/curriculum_aware_reward_fn/std": 0.315234512090683, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 401.25225830078125, "completions/mean_terminated_length": 401.25225830078125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.13202681794739557, "grad_norm": 0.16327537596225739, "kl": 0.01148223876953125, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 14914660.0, "reward": 1.095982313156128, "reward_std": 0.11626958847045898, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1004464253783226, "rewards/curriculum_aware_reward_fn/std": 0.30093035101890564, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 405.7098388671875, "completions/mean_terminated_length": 405.7098388671875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.1361526560082517, "grad_norm": 0.1624334305524826, "kl": 0.010498046875, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 15374059.0, "reward": 1.0625, "reward_std": 0.11181927472352982, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.0647321417927742, "rewards/curriculum_aware_reward_fn/std": 0.24632768332958221, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1586.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 399.4040222167969, "completions/mean_terminated_length": 399.4040222167969, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.14027849406910778, "grad_norm": 0.16465216875076294, "kl": 0.01153564453125, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 15814162.0, "reward": 1.046875, "reward_std": 0.09444627165794373, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.0491071417927742, "rewards/curriculum_aware_reward_fn/std": 0.2163332849740982, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1302.0, "completions/max_terminated_length": 1302.0, "completions/mean_length": 374.3169860839844, "completions/mean_terminated_length": 374.3169860839844, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.1444043321299639, "grad_norm": 0.17829546332359314, "kl": 0.0111541748046875, "learning_rate": 1e-06, "loss": 0.006, "num_tokens": 16266252.0, "reward": 1.0959821939468384, "reward_std": 0.13503600656986237, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.0959821417927742, "rewards/curriculum_aware_reward_fn/std": 0.29489606618881226, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1477.0, "completions/max_terminated_length": 1477.0, "completions/mean_length": 409.1852722167969, "completions/mean_terminated_length": 409.1852722167969, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.14853017019082002, "grad_norm": 0.1465103179216385, "kl": 0.0106964111328125, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 16718843.0, "reward": 1.0669643878936768, "reward_std": 0.0910092145204544, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774, "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 394.8906555175781, "completions/mean_terminated_length": 394.8906555175781, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.15265600825167613, "grad_norm": 0.16230526566505432, "kl": 0.0101165771484375, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 17163826.0, "reward": 1.055803656578064, "reward_std": 0.09517396241426468, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.0558035708963871, "rewards/curriculum_aware_reward_fn/std": 0.2297986000776291, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 383.43304443359375, "completions/mean_terminated_length": 383.43304443359375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.15678184631253222, "grad_norm": 0.20570653676986694, "kl": 0.011016845703125, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 17593092.0, "reward": 1.087053656578064, "reward_std": 0.152753084897995, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.0892857164144516, "rewards/curriculum_aware_reward_fn/std": 0.2854745090007782, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2609.0, "completions/max_terminated_length": 2609.0, "completions/mean_length": 397.0848388671875, "completions/mean_terminated_length": 397.0848388671875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.16090768437338834, "grad_norm": 0.17590661346912384, "kl": 0.01491546630859375, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 18041627.0, "reward": 1.046875, "reward_std": 0.1003519594669342, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.0535714291036129, "rewards/curriculum_aware_reward_fn/std": 0.2254217267036438, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 387.1004638671875, "completions/mean_terminated_length": 387.1004638671875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.16503352243424446, "grad_norm": 0.10793520510196686, "kl": 0.01047515869140625, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 18478689.0, "reward": 1.008928656578064, "reward_std": 0.03543417155742645, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.01116071455180645, "rewards/curriculum_aware_reward_fn/std": 0.10517053306102753, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2713.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 398.80804443359375, "completions/mean_terminated_length": 398.80804443359375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.16915936049510058, "grad_norm": 0.15646153688430786, "kl": 0.0096588134765625, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 18927975.0, "reward": 1.0625001192092896, "reward_std": 0.09590165317058563, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.0625, "rewards/curriculum_aware_reward_fn/std": 0.24233205616474152, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 341.2879638671875, "completions/mean_terminated_length": 341.2879638671875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.17328519855595667, "grad_norm": 0.22293926775455475, "kl": 0.0127105712890625, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 19322246.0, "reward": 1.140625, "reward_std": 0.15564143657684326, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349845170975, "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968, "rewards/curriculum_aware_reward_fn/std": 0.3570319712162018, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 379.36163330078125, "completions/mean_terminated_length": 379.36163330078125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.1774110366168128, "grad_norm": 0.28228673338890076, "kl": 0.030914306640625, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 19752880.0, "reward": 1.0736607313156128, "reward_std": 0.11112767457962036, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.0758928582072258, "rewards/curriculum_aware_reward_fn/std": 0.265122652053833, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 377.8013610839844, "completions/mean_terminated_length": 377.8013610839844, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.1815368746776689, "grad_norm": 0.2990739047527313, "kl": 0.0368804931640625, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 20167632.0, "reward": 1.0602679252624512, "reward_std": 0.14310891926288605, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.0736607164144516, "rewards/curriculum_aware_reward_fn/std": 0.2615099549293518, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1396.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 395.7812805175781, "completions/mean_terminated_length": 395.7812805175781, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.18566271273852503, "grad_norm": 0.1612071841955185, "kl": 0.01116180419921875, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 20634364.0, "reward": 1.078125, "reward_std": 0.10277574509382248, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.078125, "rewards/curriculum_aware_reward_fn/std": 0.26866820454597473, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3270.0, "completions/mean_length": 392.9308166503906, "completions/mean_terminated_length": 384.64654541015625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.18978855079938112, "grad_norm": 0.2190452516078949, "kl": 0.01152801513671875, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 21078546.0, "reward": 1.0691964626312256, "reward_std": 0.1653541624546051, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 409.63616943359375, "completions/mean_terminated_length": 409.63616943359375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.19391438886023724, "grad_norm": 0.19933559000492096, "kl": 0.01776885986328125, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 21532193.0, "reward": 1.087053656578064, "reward_std": 0.1079537570476532, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0915178582072258, "rewards/curriculum_aware_reward_fn/std": 0.2886664867401123, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 384.1071472167969, "completions/mean_terminated_length": 375.8031311035156, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.19804022692109335, "grad_norm": 0.20565547049045563, "kl": 0.0159759521484375, "learning_rate": 1e-06, "loss": 0.0304, "num_tokens": 21958954.0, "reward": 1.040178656578064, "reward_std": 0.14413805305957794, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.0580357126891613, "rewards/curriculum_aware_reward_fn/std": 0.23407234251499176, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1152.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 367.13616943359375, "completions/mean_terminated_length": 367.13616943359375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.20216606498194944, "grad_norm": 0.29098284244537354, "kl": 0.01155853271484375, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 22393365.0, "reward": 1.0334821939468384, "reward_std": 0.1472758650779724, "rewards/code_format_reward/mean": 0.96875, "rewards/code_format_reward/std": 0.17418713867664337, "rewards/curriculum_aware_reward_fn/mean": 0.0647321417927742, "rewards/curriculum_aware_reward_fn/std": 0.255248099565506, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1492.0, "completions/max_terminated_length": 1492.0, "completions/mean_length": 381.1160888671875, "completions/mean_terminated_length": 381.1160888671875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.20629190304280556, "grad_norm": 0.23193906247615814, "kl": 0.0115966796875, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 22824561.0, "reward": 1.0647321939468384, "reward_std": 0.15034642815589905, "rewards/code_format_reward/mean": 0.96875, "rewards/code_format_reward/std": 0.17418713867664337, "rewards/curriculum_aware_reward_fn/mean": 0.0959821417927742, "rewards/curriculum_aware_reward_fn/std": 0.29489603638648987, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 369.1250305175781, "completions/mean_terminated_length": 369.1250305175781, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.21041774110366168, "grad_norm": 0.18956728279590607, "kl": 0.010467529296875, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 23253829.0, "reward": 1.0513393878936768, "reward_std": 0.11356022208929062, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.0625, "rewards/curriculum_aware_reward_fn/std": 0.24233205616474152, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1148.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 403.7276916503906, "completions/mean_terminated_length": 403.7276916503906, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.2145435791645178, "grad_norm": 0.2548670470714569, "kl": 0.01268768310546875, "learning_rate": 1e-06, "loss": 0.0307, "num_tokens": 23725752.0, "reward": 1.024553656578064, "reward_std": 0.1569402664899826, "rewards/code_format_reward/mean": 0.96875, "rewards/code_format_reward/std": 0.17418713867664337, "rewards/curriculum_aware_reward_fn/mean": 0.0558035708963871, "rewards/curriculum_aware_reward_fn/std": 0.22979861497879028, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1264.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 379.87725830078125, "completions/mean_terminated_length": 379.87725830078125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.2186694172253739, "grad_norm": 0.1965116709470749, "kl": 0.0096893310546875, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 24159753.0, "reward": 1.102678656578064, "reward_std": 0.1614503264427185, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.109375, "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 384.44866943359375, "completions/mean_terminated_length": 384.44866943359375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.22279525528623, "grad_norm": 0.16062971949577332, "kl": 0.01007843017578125, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 24585498.0, "reward": 1.046875, "reward_std": 0.08679961413145065, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.0491071417927742, "rewards/curriculum_aware_reward_fn/std": 0.2163332849740982, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 372.33038330078125, "completions/mean_terminated_length": 372.33038330078125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.22692109334708613, "grad_norm": 0.16007283329963684, "kl": 0.01064300537109375, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 25046624.0, "reward": 1.0580357313156128, "reward_std": 0.10524440556764603, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.0647321417927742, "rewards/curriculum_aware_reward_fn/std": 0.24632768332958221, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1242.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 379.7433166503906, "completions/mean_terminated_length": 379.7433166503906, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.23104693140794225, "grad_norm": 0.1447717249393463, "kl": 0.00945281982421875, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 25506493.0, "reward": 1.0424107313156128, "reward_std": 0.07401982694864273, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.046875, "rewards/curriculum_aware_reward_fn/std": 0.21160738170146942, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1824.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 356.5245666503906, "completions/mean_terminated_length": 356.5245666503906, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.23517276946879834, "grad_norm": 0.16937297582626343, "kl": 0.01064300537109375, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 25943962.0, "reward": 1.1004464626312256, "reward_std": 0.11289104074239731, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1049107164144516, "rewards/curriculum_aware_reward_fn/std": 0.31398850679397583, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2182.0, "completions/max_terminated_length": 2182.0, "completions/mean_length": 350.82366943359375, "completions/mean_terminated_length": 350.82366943359375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.23929860752965446, "grad_norm": 0.21471168100833893, "kl": 0.01995849609375, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 26362056.0, "reward": 1.0669643878936768, "reward_std": 0.1305496245622635, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.078125, "rewards/curriculum_aware_reward_fn/std": 0.26866820454597473, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 375.6651916503906, "completions/mean_terminated_length": 375.6651916503906, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.24342444559051057, "grad_norm": 0.15522673726081848, "kl": 0.00995635986328125, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 26800204.0, "reward": 1.078125, "reward_std": 0.08754973858594894, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.0803571417927742, "rewards/curriculum_aware_reward_fn/std": 0.2721492052078247, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 353.2567138671875, "completions/mean_terminated_length": 353.2567138671875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.2475502836513667, "grad_norm": 0.2160206288099289, "kl": 0.0291595458984375, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 27239954.0, "reward": 1.0758929252624512, "reward_std": 0.07396131008863449, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0803571417927742, "rewards/curriculum_aware_reward_fn/std": 0.2721492052078247, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1213.0, "completions/max_terminated_length": 1213.0, "completions/mean_length": 358.9129638671875, "completions/mean_terminated_length": 358.9129638671875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.2516761217122228, "grad_norm": 0.250458687543869, "kl": 0.016815185546875, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 27660972.0, "reward": 1.0781251192092896, "reward_std": 0.19869984686374664, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.0959821417927742, "rewards/curriculum_aware_reward_fn/std": 0.29489606618881226, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2409.0, "completions/max_terminated_length": 2409.0, "completions/mean_length": 367.5781555175781, "completions/mean_terminated_length": 367.5781555175781, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2558019597730789, "grad_norm": 0.20474466681480408, "kl": 0.01043701171875, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 28109434.0, "reward": 1.0647321939468384, "reward_std": 0.12642095983028412, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0691964253783226, "rewards/curriculum_aware_reward_fn/std": 0.25407159328460693, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 347.2500305175781, "completions/mean_terminated_length": 347.2500305175781, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.259927797833935, "grad_norm": 0.18051838874816895, "kl": 0.01055908203125, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 28527516.0, "reward": 1.0892857313156128, "reward_std": 0.13675454258918762, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.0915178582072258, "rewards/curriculum_aware_reward_fn/std": 0.2963150441646576, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 372.4508972167969, "completions/mean_terminated_length": 372.4508972167969, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.26405363589479114, "grad_norm": 0.1765376180410385, "kl": 0.01483917236328125, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 28974255.0, "reward": 1.046875, "reward_std": 0.09173692017793655, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.0535714291036129, "rewards/curriculum_aware_reward_fn/std": 0.2254217267036438, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 342.65179443359375, "completions/mean_terminated_length": 342.65179443359375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.26817947395564723, "grad_norm": 0.19896657764911652, "kl": 0.01023101806640625, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 29397144.0, "reward": 1.0602679252624512, "reward_std": 0.11952443420886993, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0647321417927742, "rewards/curriculum_aware_reward_fn/std": 0.25524812936782837, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1065.0, "completions/mean_length": 356.8660888671875, "completions/mean_terminated_length": 348.5011291503906, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.2723053120165034, "grad_norm": 0.16738773882389069, "kl": 0.01074981689453125, "learning_rate": 1e-06, "loss": 0.0307, "num_tokens": 29827059.0, "reward": 1.0513393878936768, "reward_std": 0.09272774308919907, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0602678582072258, "rewards/curriculum_aware_reward_fn/std": 0.23824848234653473, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 321.3817138671875, "completions/mean_terminated_length": 321.3817138671875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.27643115007735947, "grad_norm": 0.17614111304283142, "kl": 0.0118865966796875, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 30228127.0, "reward": 1.0647321939468384, "reward_std": 0.116653211414814, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774, "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 339.08929443359375, "completions/mean_terminated_length": 339.08929443359375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.28055698813821556, "grad_norm": 0.17406179010868073, "kl": 0.01302337646484375, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 30644317.0, "reward": 1.0915179252624512, "reward_std": 0.11731892824172974, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.09375, "rewards/curriculum_aware_reward_fn/std": 0.2918064594268799, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 341.93975830078125, "completions/mean_terminated_length": 341.93975830078125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2846828261990717, "grad_norm": 0.2198536992073059, "kl": 0.01514434814453125, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 31055498.0, "reward": 1.0691964626312256, "reward_std": 0.13643288612365723, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.0825892835855484, "rewards/curriculum_aware_reward_fn/std": 0.2755681276321411, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 339.0, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.2888086642599278, "grad_norm": 0.20111602544784546, "kl": 0.01573944091796875, "learning_rate": 1e-06, "loss": -0.0081, "num_tokens": 31462989.0, "reward": 1.0446429252624512, "reward_std": 0.11356022953987122, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0535714291036129, "rewards/curriculum_aware_reward_fn/std": 0.225421741604805, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 337.828125, "completions/mean_terminated_length": 337.828125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2929345023207839, "grad_norm": 0.1859571784734726, "kl": 0.01120758056640625, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 31888008.0, "reward": 1.118303656578064, "reward_std": 0.1212429627776146, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1212.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 348.5781555175781, "completions/mean_terminated_length": 348.5781555175781, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.29706034038164003, "grad_norm": 0.22600430250167847, "kl": 0.01161956787109375, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 32324626.0, "reward": 1.071428656578064, "reward_std": 0.14830279350280762, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.0825892835855484, "rewards/curriculum_aware_reward_fn/std": 0.2755681276321411, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 357.04241943359375, "completions/mean_terminated_length": 340.2757873535156, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.3011861784424961, "grad_norm": 0.23378655314445496, "kl": 0.022369384765625, "learning_rate": 1e-06, "loss": 0.0476, "num_tokens": 32749832.0, "reward": 1.1004464626312256, "reward_std": 0.1639774888753891, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1138392835855484, "rewards/curriculum_aware_reward_fn/std": 0.34496763348579407, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1269.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 372.66741943359375, "completions/mean_terminated_length": 372.66741943359375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.30531201650335227, "grad_norm": 0.18296393752098083, "kl": 0.0102996826171875, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 33209590.0, "reward": 1.0669643878936768, "reward_std": 0.11331482231616974, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0758928582072258, "rewards/curriculum_aware_reward_fn/std": 0.265122652053833, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1985.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 364.1875305175781, "completions/mean_terminated_length": 364.1875305175781, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.30943785456420836, "grad_norm": 0.20619679987430573, "kl": 0.0104827880859375, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 33639089.0, "reward": 1.078125, "reward_std": 0.1350584328174591, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0870535746216774, "rewards/curriculum_aware_reward_fn/std": 0.2822286784648895, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2105.0, "completions/max_terminated_length": 2105.0, "completions/mean_length": 380.7879638671875, "completions/mean_terminated_length": 380.7879638671875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.31356369262506445, "grad_norm": 0.211038738489151, "kl": 0.01093292236328125, "learning_rate": 1e-06, "loss": 0.0249, "num_tokens": 34094659.0, "reward": 1.0736607313156128, "reward_std": 0.14870882034301758, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0825892835855484, "rewards/curriculum_aware_reward_fn/std": 0.2755681276321411, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 384.37725830078125, "completions/mean_terminated_length": 384.37725830078125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3176895306859206, "grad_norm": 0.1940946727991104, "kl": 0.0115814208984375, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 34546174.0, "reward": 1.1160714626312256, "reward_std": 0.160459503531456, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 362.2410888671875, "completions/mean_terminated_length": 362.2410888671875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.3218153687467767, "grad_norm": 0.22700199484825134, "kl": 0.01206207275390625, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 34984006.0, "reward": 1.0803571939468384, "reward_std": 0.1534363329410553, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.09375, "rewards/curriculum_aware_reward_fn/std": 0.2918064594268799, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 370.8437805175781, "completions/mean_terminated_length": 370.8437805175781, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3259412068076328, "grad_norm": 0.23479129374027252, "kl": 0.01564788818359375, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 35421429.0, "reward": 1.087053656578064, "reward_std": 0.16331051290035248, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1071428582072258, "rewards/curriculum_aware_reward_fn/std": 0.3096405565738678, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 362.3326110839844, "completions/mean_terminated_length": 362.3326110839844, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.3300670448684889, "grad_norm": 0.22646324336528778, "kl": 0.0144500732421875, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 35846044.0, "reward": 1.0535714626312256, "reward_std": 0.14729610085487366, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.078125, "rewards/curriculum_aware_reward_fn/std": 0.26866820454597473, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 370.5915222167969, "completions/mean_terminated_length": 370.5915222167969, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.334192882929345, "grad_norm": 0.2028733342885971, "kl": 0.012664794921875, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 36283473.0, "reward": 1.118303656578064, "reward_std": 0.1441156268119812, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3570949137210846, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1120.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 366.57366943359375, "completions/mean_terminated_length": 366.57366943359375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.33831872099020116, "grad_norm": 0.20468416810035706, "kl": 0.0113372802734375, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 36716022.0, "reward": 1.078125, "reward_std": 0.15004372596740723, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.0892857164144516, "rewards/curriculum_aware_reward_fn/std": 0.2854744791984558, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 362.0602722167969, "completions/mean_terminated_length": 362.0602722167969, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.34244455905105725, "grad_norm": 0.18309882283210754, "kl": 0.01456451416015625, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 37170209.0, "reward": 1.09375, "reward_std": 0.10698535293340683, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1004464253783226, "rewards/curriculum_aware_reward_fn/std": 0.30093035101890564, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 392.40179443359375, "completions/mean_terminated_length": 384.1163330078125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.34657039711191334, "grad_norm": 0.19598782062530518, "kl": 0.012786865234375, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 37628938.0, "reward": 1.0647321939468384, "reward_std": 0.12604080140590668, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.0758928582072258, "rewards/curriculum_aware_reward_fn/std": 0.2651226818561554, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 373.6964416503906, "completions/mean_terminated_length": 373.6964416503906, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3506962351727695, "grad_norm": 0.2011183202266693, "kl": 0.01247406005859375, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 38077082.0, "reward": 1.0580357313156128, "reward_std": 0.12462149560451508, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0669642835855484, "rewards/curriculum_aware_reward_fn/std": 0.2502395808696747, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 331.75225830078125, "completions/mean_terminated_length": 331.75225830078125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.3548220732336256, "grad_norm": 0.18622373044490814, "kl": 0.0123291015625, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 38483251.0, "reward": 1.087053656578064, "reward_std": 0.12022969126701355, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0915178582072258, "rewards/curriculum_aware_reward_fn/std": 0.2886664867401123, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 362.0067138671875, "completions/mean_terminated_length": 353.65325927734375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.35894791129448167, "grad_norm": 0.17447015643119812, "kl": 0.01161956787109375, "learning_rate": 1e-06, "loss": 0.0348, "num_tokens": 38929784.0, "reward": 1.0513393878936768, "reward_std": 0.11849531531333923, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0602678582072258, "rewards/curriculum_aware_reward_fn/std": 0.23824848234653473, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 359.0133972167969, "completions/mean_terminated_length": 359.0133972167969, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3630737493553378, "grad_norm": 0.23724974691867828, "kl": 0.01897430419921875, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 39366338.0, "reward": 1.0803571939468384, "reward_std": 0.10008881986141205, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 367.9576110839844, "completions/mean_terminated_length": 367.9576110839844, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3671995874161939, "grad_norm": 0.16972604393959045, "kl": 0.01207733154296875, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 39813059.0, "reward": 1.0558035373687744, "reward_std": 0.12033183872699738, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.0736607164144516, "rewards/curriculum_aware_reward_fn/std": 0.2615099549293518, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3521.0, "completions/max_terminated_length": 3521.0, "completions/mean_length": 380.55804443359375, "completions/mean_terminated_length": 380.55804443359375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.37132542547705005, "grad_norm": 0.13963274657726288, "kl": 0.01036834716796875, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 40263080.0, "reward": 1.055803656578064, "reward_std": 0.07227887958288193, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.0580357126891613, "rewards/curriculum_aware_reward_fn/std": 0.23407234251499176, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 354.12054443359375, "completions/mean_terminated_length": 354.12054443359375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.37545126353790614, "grad_norm": 0.18742915987968445, "kl": 0.01462554931640625, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 40699906.0, "reward": 1.078125, "reward_std": 0.13052719831466675, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2277.0, "completions/max_terminated_length": 2277.0, "completions/mean_length": 355.04241943359375, "completions/mean_terminated_length": 355.04241943359375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.37957710159876223, "grad_norm": 0.18381254374980927, "kl": 0.0133056640625, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 41117629.0, "reward": 1.078125, "reward_std": 0.11216334998607635, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, "rewards/curriculum_aware_reward_fn/std": 0.286835640668869, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 331.6964416503906, "completions/mean_terminated_length": 331.6964416503906, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.3837029396596184, "grad_norm": 0.21641084551811218, "kl": 0.01242828369140625, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 41539698.0, "reward": 1.1294643878936768, "reward_std": 0.1750185638666153, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1339285671710968, "rewards/curriculum_aware_reward_fn/std": 0.3538357615470886, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 358.2544860839844, "completions/mean_terminated_length": 358.2544860839844, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.38782877772047447, "grad_norm": 0.16547773778438568, "kl": 0.01194000244140625, "learning_rate": 1e-06, "loss": 0.0182, "num_tokens": 41965706.0, "reward": 1.0758929252624512, "reward_std": 0.10550753027200699, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.078125, "rewards/curriculum_aware_reward_fn/std": 0.26866820454597473, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 358.5848388671875, "completions/mean_terminated_length": 358.5848388671875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.39195461578133056, "grad_norm": 0.19228342175483704, "kl": 0.01303863525390625, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 42402082.0, "reward": 1.0691964626312256, "reward_std": 0.11531484127044678, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.078125, "rewards/curriculum_aware_reward_fn/std": 0.26866820454597473, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1142.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 370.9732360839844, "completions/mean_terminated_length": 370.9732360839844, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.3960804538421867, "grad_norm": 0.1713048368692398, "kl": 0.0135040283203125, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 42839507.0, "reward": 1.1004465818405151, "reward_std": 0.10174884647130966, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1116071417927742, "rewards/curriculum_aware_reward_fn/std": 0.31523454189300537, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1210.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 366.65850830078125, "completions/mean_terminated_length": 366.65850830078125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4002062919030428, "grad_norm": 0.20571546256542206, "kl": 0.01522064208984375, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 43269749.0, "reward": 1.1205358505249023, "reward_std": 0.1542084813117981, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, "rewards/curriculum_aware_reward_fn/std": 0.3716694116592407, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 392.9285888671875, "completions/mean_terminated_length": 376.3228759765625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.4043321299638989, "grad_norm": 0.21601620316505432, "kl": 0.01264190673828125, "learning_rate": 1e-06, "loss": 0.0601, "num_tokens": 43721455.0, "reward": 1.0848214626312256, "reward_std": 0.17263084650039673, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.0982142835855484, "rewards/curriculum_aware_reward_fn/std": 0.29793688654899597, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 343.9263610839844, "completions/mean_terminated_length": 343.9263610839844, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.40845796802475504, "grad_norm": 0.2037304788827896, "kl": 0.02156829833984375, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 44145748.0, "reward": 1.0825893878936768, "reward_std": 0.09200004488229752, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.0892857164144516, "rewards/curriculum_aware_reward_fn/std": 0.2854745090007782, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 351.9196472167969, "completions/mean_terminated_length": 351.9196472167969, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.4125838060856111, "grad_norm": 0.20430681109428406, "kl": 0.01955413818359375, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 44568043.0, "reward": 1.102678656578064, "reward_std": 0.1333174854516983, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.1160714253783226, "rewards/curriculum_aware_reward_fn/std": 0.32066863775253296, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1187.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 371.4375305175781, "completions/mean_terminated_length": 371.4375305175781, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.4167096441464673, "grad_norm": 0.19734680652618408, "kl": 0.0188446044921875, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 45017457.0, "reward": 1.040178656578064, "reward_std": 0.11046725511550903, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0491071417927742, "rewards/curriculum_aware_reward_fn/std": 0.2163332849740982, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 353.74554443359375, "completions/mean_terminated_length": 353.74554443359375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.42083548220732336, "grad_norm": 0.19347746670246124, "kl": 0.01445770263671875, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 45461283.0, "reward": 1.0959821939468384, "reward_std": 0.13922318816184998, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1049107164144516, "rewards/curriculum_aware_reward_fn/std": 0.3067809045314789, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 382.8594055175781, "completions/mean_terminated_length": 374.5525817871094, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.42496132026817945, "grad_norm": 0.22364647686481476, "kl": 0.0173492431640625, "learning_rate": 1e-06, "loss": 0.0405, "num_tokens": 45913238.0, "reward": 1.064732313156128, "reward_std": 0.15485522150993347, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.078125, "rewards/curriculum_aware_reward_fn/std": 0.26866820454597473, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 384.81475830078125, "completions/mean_terminated_length": 384.81475830078125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.4290871583290356, "grad_norm": 0.19948019087314606, "kl": 0.0163116455078125, "learning_rate": 1e-06, "loss": 0.0194, "num_tokens": 46341618.0, "reward": 1.0513393878936768, "reward_std": 0.13305436074733734, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.0691964253783226, "rewards/curriculum_aware_reward_fn/std": 0.25407159328460693, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 347.5915222167969, "completions/mean_terminated_length": 347.5915222167969, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.4332129963898917, "grad_norm": 0.21573631465435028, "kl": 0.0137481689453125, "learning_rate": 1e-06, "loss": 0.0313, "num_tokens": 46757417.0, "reward": 1.1361607313156128, "reward_std": 0.17457421123981476, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548, "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1106.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 355.9620666503906, "completions/mean_terminated_length": 355.9620666503906, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.4373388344507478, "grad_norm": 0.19402246177196503, "kl": 0.013671875, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 47190924.0, "reward": 1.0580358505249023, "reward_std": 0.1131766065955162, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0625, "rewards/curriculum_aware_reward_fn/std": 0.24233205616474152, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2242.0, "completions/max_terminated_length": 2242.0, "completions/mean_length": 354.1919860839844, "completions/mean_terminated_length": 354.1919860839844, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.44146467251160393, "grad_norm": 0.1923978328704834, "kl": 0.01230621337890625, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 47602407.0, "reward": 1.1160714626312256, "reward_std": 0.1504049301147461, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 343.07366943359375, "completions/mean_terminated_length": 334.6778564453125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.44559051057246, "grad_norm": 0.22708895802497864, "kl": 0.01428985595703125, "learning_rate": 1e-06, "loss": 0.0344, "num_tokens": 48024232.0, "reward": 1.1316965818405151, "reward_std": 0.16917134821414948, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.140625, "rewards/curriculum_aware_reward_fn/std": 0.3480229377746582, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1502.0, "completions/max_terminated_length": 1502.0, "completions/mean_length": 327.47100830078125, "completions/mean_terminated_length": 327.47100830078125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.44971634863331617, "grad_norm": 0.19042733311653137, "kl": 0.0147552490234375, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 48438318.0, "reward": 1.1205357313156128, "reward_std": 0.13614732027053833, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1403.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 349.0602722167969, "completions/mean_terminated_length": 349.0602722167969, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.45384218669417226, "grad_norm": 0.20683661103248596, "kl": 0.0180206298828125, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 48866717.0, "reward": 1.087053656578064, "reward_std": 0.13611124455928802, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.0982142835855484, "rewards/curriculum_aware_reward_fn/std": 0.29793688654899597, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 316.9129638671875, "completions/mean_terminated_length": 316.9129638671875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.45796802475502835, "grad_norm": 0.30014339089393616, "kl": 0.025299072265625, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 49254142.0, "reward": 1.1227679252624512, "reward_std": 0.1876002848148346, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, "rewards/curriculum_aware_reward_fn/std": 0.3385384678840637, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1656.0, "completions/max_terminated_length": 1656.0, "completions/mean_length": 339.7723388671875, "completions/mean_terminated_length": 339.7723388671875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.4620938628158845, "grad_norm": 0.17152318358421326, "kl": 0.01714324951171875, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 49688271.0, "reward": 1.087053656578064, "reward_std": 0.10179148614406586, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.09375, "rewards/curriculum_aware_reward_fn/std": 0.2918064594268799, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 350.7120666503906, "completions/mean_terminated_length": 342.3333435058594, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.4662197008767406, "grad_norm": 0.22815488278865814, "kl": 0.015411376953125, "learning_rate": 1e-06, "loss": 0.0364, "num_tokens": 50130577.0, "reward": 1.087053656578064, "reward_std": 0.1431247889995575, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0959821417927742, "rewards/curriculum_aware_reward_fn/std": 0.29489603638648987, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 320.9888610839844, "completions/mean_terminated_length": 320.9888610839844, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.4703455389375967, "grad_norm": 0.21225294470787048, "kl": 0.01531219482421875, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 50526569.0, "reward": 1.0625, "reward_std": 0.13894417881965637, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.0758928582072258, "rewards/curriculum_aware_reward_fn/std": 0.2651226818561554, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 318.9754638671875, "completions/mean_terminated_length": 318.9754638671875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4744713769984528, "grad_norm": 0.221530482172966, "kl": 0.0149688720703125, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 50930927.0, "reward": 1.125, "reward_std": 0.1612457036972046, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1294642835855484, "rewards/curriculum_aware_reward_fn/std": 0.35549718141555786, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 336.91741943359375, "completions/mean_terminated_length": 336.91741943359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.4785972150593089, "grad_norm": 0.17955932021141052, "kl": 0.01438140869140625, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 51347320.0, "reward": 1.0959821939468384, "reward_std": 0.087205670773983, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774, "rewards/curriculum_aware_reward_fn/std": 0.30387789011001587, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1404.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 342.7321472167969, "completions/mean_terminated_length": 342.7321472167969, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.48272305312016506, "grad_norm": 0.1980082243680954, "kl": 0.01482391357421875, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 51779814.0, "reward": 1.1160714626312256, "reward_std": 0.14484331011772156, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1083.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 322.49554443359375, "completions/mean_terminated_length": 322.49554443359375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.48684889118102115, "grad_norm": 0.1956605315208435, "kl": 0.01366424560546875, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 52195514.0, "reward": 1.0803571939468384, "reward_std": 0.1101592630147934, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.0870535746216774, "rewards/curriculum_aware_reward_fn/std": 0.2822286784648895, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 324.37725830078125, "completions/mean_terminated_length": 324.37725830078125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.49097472924187724, "grad_norm": 0.1853463053703308, "kl": 0.0139007568359375, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 52613289.0, "reward": 1.0892857313156128, "reward_std": 0.10771305114030838, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.0915178582072258, "rewards/curriculum_aware_reward_fn/std": 0.3319246470928192, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 320.3258972167969, "completions/mean_terminated_length": 311.8791809082031, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.4951005673027334, "grad_norm": 0.2443920075893402, "kl": 0.02069854736328125, "learning_rate": 1e-06, "loss": 0.0391, "num_tokens": 53019386.0, "reward": 1.1071429252624512, "reward_std": 0.17067037522792816, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 323.3035888671875, "completions/mean_terminated_length": 314.863525390625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.4992264053635895, "grad_norm": 0.19038715958595276, "kl": 0.01386260986328125, "learning_rate": 1e-06, "loss": 0.0331, "num_tokens": 53431374.0, "reward": 1.0758929252624512, "reward_std": 0.12470243126153946, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.0825892835855484, "rewards/curriculum_aware_reward_fn/std": 0.2755681276321411, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 307.5826110839844, "completions/mean_terminated_length": 307.5826110839844, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.5033522434244456, "grad_norm": 0.2671750783920288, "kl": 0.01509857177734375, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 53835066.0, "reward": 1.1450893878936768, "reward_std": 0.2060384899377823, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 318.1875, "completions/mean_terminated_length": 318.1875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.5074780814853017, "grad_norm": 0.19120195508003235, "kl": 0.02199554443359375, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 54259301.0, "reward": 1.0915179252624512, "reward_std": 0.11772496998310089, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1004464253783226, "rewards/curriculum_aware_reward_fn/std": 0.30093035101890564, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 296.36163330078125, "completions/mean_terminated_length": 296.36163330078125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.5116039195461578, "grad_norm": 0.20886196196079254, "kl": 0.01728057861328125, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 54648138.0, "reward": 1.0758929252624512, "reward_std": 0.12540769577026367, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 304.55804443359375, "completions/mean_terminated_length": 304.55804443359375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5157297576070139, "grad_norm": 0.255563348531723, "kl": 0.019866943359375, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 55047066.0, "reward": 1.087053656578064, "reward_std": 0.1697235256433487, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1049107164144516, "rewards/curriculum_aware_reward_fn/std": 0.3067809045314789, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 290.7901916503906, "completions/mean_terminated_length": 290.7901916503906, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.51985559566787, "grad_norm": 0.23012115061283112, "kl": 0.02146148681640625, "learning_rate": 1e-06, "loss": 0.0238, "num_tokens": 55436753.0, "reward": 1.1227679252624512, "reward_std": 0.1416693925857544, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164842426776886, "rewards/curriculum_aware_reward_fn/mean": 0.1294642835855484, "rewards/curriculum_aware_reward_fn/std": 0.3360884189605713, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 305.7477722167969, "completions/mean_terminated_length": 305.7477722167969, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.5239814337287262, "grad_norm": 0.1966557800769806, "kl": 0.017059326171875, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 55847134.0, "reward": 1.0758929252624512, "reward_std": 0.1400330662727356, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 309.9285888671875, "completions/mean_terminated_length": 301.4586181640625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5281072717895823, "grad_norm": 0.18832871317863464, "kl": 0.01653289794921875, "learning_rate": 1e-06, "loss": 0.0282, "num_tokens": 56252633.0, "reward": 1.09375, "reward_std": 0.12293906509876251, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0982142835855484, "rewards/curriculum_aware_reward_fn/std": 0.29793688654899597, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 312.68975830078125, "completions/mean_terminated_length": 312.68975830078125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.5322331098504384, "grad_norm": 0.16387410461902618, "kl": 0.0173797607421875, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 56690687.0, "reward": 1.071428656578064, "reward_std": 0.09371857345104218, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.0736607164144516, "rewards/curriculum_aware_reward_fn/std": 0.2615099549293518, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 296.97100830078125, "completions/mean_terminated_length": 296.97100830078125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.5363589479112945, "grad_norm": 0.23210418224334717, "kl": 0.02040863037109375, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 57090374.0, "reward": 1.1272321939468384, "reward_std": 0.14067856967449188, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1294642835855484, "rewards/curriculum_aware_reward_fn/std": 0.3491474688053131, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 308.7120666503906, "completions/mean_terminated_length": 300.2393798828125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.5404847859721505, "grad_norm": 0.18966835737228394, "kl": 0.0182342529296875, "learning_rate": 1e-06, "loss": 0.0241, "num_tokens": 57506682.0, "reward": 1.102678656578064, "reward_std": 0.12440101057291031, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.109375, "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 316.7901916503906, "completions/mean_terminated_length": 316.7901916503906, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.5446106240330068, "grad_norm": 0.7599146366119385, "kl": 0.13059234619140625, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 57931131.0, "reward": 1.0959821939468384, "reward_std": 0.13372354209423065, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1071428582072258, "rewards/curriculum_aware_reward_fn/std": 0.3096405565738678, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 302.4933166503906, "completions/mean_terminated_length": 302.4933166503906, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.5487364620938628, "grad_norm": 0.18858854472637177, "kl": 0.01956939697265625, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 58345115.0, "reward": 1.0669643878936768, "reward_std": 0.10107965767383575, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774, "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 323.3526916503906, "completions/mean_terminated_length": 314.9127502441406, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5528623001547189, "grad_norm": 0.18076685070991516, "kl": 0.01917266845703125, "learning_rate": 1e-06, "loss": 0.0332, "num_tokens": 58760485.0, "reward": 1.0625, "reward_std": 0.11533726006746292, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0669642835855484, "rewards/curriculum_aware_reward_fn/std": 0.2502395808696747, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 339.046875, "completions/mean_terminated_length": 339.046875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.556988138215575, "grad_norm": 0.22838547825813293, "kl": 0.0206451416015625, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 59196046.0, "reward": 1.0982143878936768, "reward_std": 0.14367347955703735, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1049107164144516, "rewards/curriculum_aware_reward_fn/std": 0.3067808747291565, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2902.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 311.7276916503906, "completions/mean_terminated_length": 311.7276916503906, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.5611139762764311, "grad_norm": 0.22647054493427277, "kl": 0.01638031005859375, "learning_rate": 1e-06, "loss": -0.0059, "num_tokens": 59597535.0, "reward": 1.0736607313156128, "reward_std": 0.1215285211801529, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 319.9107360839844, "completions/mean_terminated_length": 319.9107360839844, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5652398143372873, "grad_norm": 0.2566641867160797, "kl": 0.01656341552734375, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 60006365.0, "reward": 1.0982143878936768, "reward_std": 0.1455349326133728, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1160714253783226, "rewards/curriculum_aware_reward_fn/std": 0.32066863775253296, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1955.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 340.65179443359375, "completions/mean_terminated_length": 340.65179443359375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.5693656523981434, "grad_norm": 0.20979535579681396, "kl": 0.0205078125, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 60434910.0, "reward": 1.087053656578064, "reward_std": 0.1503688544034958, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.0982142835855484, "rewards/curriculum_aware_reward_fn/std": 0.3053533434867859, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 323.08038330078125, "completions/mean_terminated_length": 323.08038330078125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.5734914904589995, "grad_norm": 0.2643740177154541, "kl": 0.02382659912109375, "learning_rate": 1e-06, "loss": 0.0339, "num_tokens": 60830411.0, "reward": 1.080357313156128, "reward_std": 0.17777277529239655, "rewards/code_format_reward/mean": 0.9732142686843872, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.1071428582072258, "rewards/curriculum_aware_reward_fn/std": 0.3096405565738678, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1315.0, "completions/max_terminated_length": 1315.0, "completions/mean_length": 330.63616943359375, "completions/mean_terminated_length": 330.63616943359375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.5776173285198556, "grad_norm": 1.9929475784301758, "kl": 0.2246856689453125, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 61253888.0, "reward": 1.1071428060531616, "reward_std": 0.19415274262428284, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 340.703125, "completions/mean_terminated_length": 340.703125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5817431665807117, "grad_norm": 0.23709121346473694, "kl": 0.0193634033203125, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 61677241.0, "reward": 1.055803656578064, "reward_std": 0.15146084129810333, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774, "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2750.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 362.0401916503906, "completions/mean_terminated_length": 362.0401916503906, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.5858690046415678, "grad_norm": 0.22590979933738708, "kl": 0.014862060546875, "learning_rate": 1e-06, "loss": 0.0223, "num_tokens": 62129193.0, "reward": 1.0535714626312256, "reward_std": 0.15240903198719025, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.0736607164144516, "rewards/curriculum_aware_reward_fn/std": 0.2615099549293518, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 364.64288330078125, "completions/mean_terminated_length": 364.64288330078125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.589994842702424, "grad_norm": 0.23674999177455902, "kl": 0.01902008056640625, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 62574016.0, "reward": 1.0736607313156128, "reward_std": 0.17122377455234528, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.0982142835855484, "rewards/curriculum_aware_reward_fn/std": 0.29793688654899597, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 335.16741943359375, "completions/mean_terminated_length": 335.16741943359375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.5941206807632801, "grad_norm": 0.260344922542572, "kl": 0.02294921875, "learning_rate": 1e-06, "loss": 0.0252, "num_tokens": 62983234.0, "reward": 1.0334821939468384, "reward_std": 0.17160393297672272, "rewards/code_format_reward/mean": 0.9709821343421936, "rewards/code_format_reward/std": 0.16804419457912445, "rewards/curriculum_aware_reward_fn/mean": 0.0625, "rewards/curriculum_aware_reward_fn/std": 0.24233205616474152, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 363.0469055175781, "completions/mean_terminated_length": 354.69573974609375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.5982465188241362, "grad_norm": 0.23047298192977905, "kl": 0.01924896240234375, "learning_rate": 1e-06, "loss": 0.0373, "num_tokens": 63415218.0, "reward": 1.0535714626312256, "reward_std": 0.18685583770275116, "rewards/code_format_reward/mean": 0.96875, "rewards/code_format_reward/std": 0.17418713867664337, "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1210.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 337.6227722167969, "completions/mean_terminated_length": 337.6227722167969, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.6023723568849922, "grad_norm": 0.29973822832107544, "kl": 0.01711273193359375, "learning_rate": 1e-06, "loss": 0.0132, "num_tokens": 63822155.0, "reward": 1.0848215818405151, "reward_std": 0.24713656306266785, "rewards/code_format_reward/mean": 0.9575892686843872, "rewards/code_format_reward/std": 0.20174957811832428, "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, "rewards/curriculum_aware_reward_fn/std": 0.3531506359577179, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 362.8906555175781, "completions/mean_terminated_length": 362.8906555175781, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.6064981949458483, "grad_norm": 0.2629507780075073, "kl": 0.01544189453125, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 64250534.0, "reward": 1.0334821939468384, "reward_std": 0.156258687376976, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.0580357126891613, "rewards/curriculum_aware_reward_fn/std": 0.23407234251499176, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2097.0, "completions/max_terminated_length": 2097.0, "completions/mean_length": 347.1852722167969, "completions/mean_terminated_length": 347.1852722167969, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.6106240330067045, "grad_norm": 0.2853531837463379, "kl": 0.0186767578125, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 64677673.0, "reward": 1.046875, "reward_std": 0.2197796106338501, "rewards/code_format_reward/mean": 0.9553571343421936, "rewards/code_format_reward/std": 0.2067493349313736, "rewards/curriculum_aware_reward_fn/mean": 0.0915178582072258, "rewards/curriculum_aware_reward_fn/std": 0.2886664867401123, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 343.26116943359375, "completions/mean_terminated_length": 343.26116943359375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.6147498710675606, "grad_norm": 0.28099095821380615, "kl": 0.0206756591796875, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 65119889.0, "reward": 1.0200893878936768, "reward_std": 0.19929464161396027, "rewards/code_format_reward/mean": 0.9486607313156128, "rewards/code_format_reward/std": 0.22093553841114044, "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774, "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 388.1942138671875, "completions/mean_terminated_length": 388.1942138671875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.6188757091284167, "grad_norm": 0.2474757879972458, "kl": 0.01856231689453125, "learning_rate": 1e-06, "loss": -0.0051, "num_tokens": 65579491.0, "reward": 0.9933035969734192, "reward_std": 0.19513337314128876, "rewards/code_format_reward/mean": 0.9397321343421936, "rewards/code_format_reward/std": 0.23824848234653473, "rewards/curriculum_aware_reward_fn/mean": 0.0535714291036129, "rewards/curriculum_aware_reward_fn/std": 0.225421741604805, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 320.4442138671875, "completions/mean_terminated_length": 320.4442138671875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.6230015471892728, "grad_norm": 0.30954962968826294, "kl": 0.02806854248046875, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 65986138.0, "reward": 1.0647321939468384, "reward_std": 0.23255980014801025, "rewards/code_format_reward/mean": 0.9508928656578064, "rewards/code_format_reward/std": 0.2163332849740982, "rewards/curriculum_aware_reward_fn/mean": 0.1138392835855484, "rewards/curriculum_aware_reward_fn/std": 0.31797105073928833, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 330.9442138671875, "completions/mean_terminated_length": 330.9442138671875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6271273852501289, "grad_norm": 0.2500414252281189, "kl": 0.0207977294921875, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 66391343.0, "reward": 1.0825893878936768, "reward_std": 0.16329465806484222, "rewards/code_format_reward/mean": 0.9620535969734192, "rewards/code_format_reward/std": 0.19128035008907318, "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3017.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 344.4754638671875, "completions/mean_terminated_length": 344.4754638671875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.631253223310985, "grad_norm": 0.3076120913028717, "kl": 0.0164947509765625, "learning_rate": 1e-06, "loss": 0.0479, "num_tokens": 66820005.0, "reward": 1.0446430444717407, "reward_std": 0.24730618298053741, "rewards/code_format_reward/mean": 0.9486607313156128, "rewards/code_format_reward/std": 0.22093553841114044, "rewards/curriculum_aware_reward_fn/mean": 0.0959821417927742, "rewards/curriculum_aware_reward_fn/std": 0.29489606618881226, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 354.52679443359375, "completions/mean_terminated_length": 354.52679443359375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.6353790613718412, "grad_norm": 0.23487085103988647, "kl": 0.01856231689453125, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 67248774.0, "reward": 1.0424107313156128, "reward_std": 0.14456433057785034, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.14789186418056488, "rewards/curriculum_aware_reward_fn/mean": 0.0647321417927742, "rewards/curriculum_aware_reward_fn/std": 0.24632768332958221, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1624.0, "completions/max_terminated_length": 1624.0, "completions/mean_length": 358.5714416503906, "completions/mean_terminated_length": 358.5714416503906, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.6395048994326973, "grad_norm": 0.20852208137512207, "kl": 0.0164794921875, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 67691997.0, "reward": 1.0736608505249023, "reward_std": 0.13752707839012146, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.0892857164144516, "rewards/curriculum_aware_reward_fn/std": 0.2854745090007782, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1211.0, "completions/max_terminated_length": 1211.0, "completions/mean_length": 321.5089416503906, "completions/mean_terminated_length": 321.5089416503906, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.6436307374935534, "grad_norm": 0.22546347975730896, "kl": 0.020050048828125, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 68111469.0, "reward": 1.1049108505249023, "reward_std": 0.1619170904159546, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 330.9419860839844, "completions/mean_terminated_length": 330.9419860839844, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.6477565755544095, "grad_norm": 0.25880876183509827, "kl": 0.02001953125, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 68537344.0, "reward": 1.0781251192092896, "reward_std": 0.1824197918176651, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774, "rewards/curriculum_aware_reward_fn/std": 0.31826144456863403, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1420.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 337.4196472167969, "completions/mean_terminated_length": 337.4196472167969, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.6518824136152656, "grad_norm": 0.2588788866996765, "kl": 0.02129364013671875, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 68932731.0, "reward": 1.102678656578064, "reward_std": 0.20531734824180603, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 346.05804443359375, "completions/mean_terminated_length": 337.6689147949219, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.6560082516761218, "grad_norm": 0.21358352899551392, "kl": 0.0220184326171875, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 69368178.0, "reward": 1.046875, "reward_std": 0.129477858543396, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.14789186418056488, "rewards/curriculum_aware_reward_fn/mean": 0.0691964253783226, "rewards/curriculum_aware_reward_fn/std": 0.25407159328460693, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 355.29241943359375, "completions/mean_terminated_length": 355.29241943359375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.6601340897369778, "grad_norm": 0.19402597844600677, "kl": 0.02142333984375, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 69827215.0, "reward": 1.0758929252624512, "reward_std": 0.1538485586643219, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.0959821417927742, "rewards/curriculum_aware_reward_fn/std": 0.29489603638648987, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 337.3839416503906, "completions/mean_terminated_length": 337.3839416503906, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.6642599277978339, "grad_norm": 0.1917792558670044, "kl": 0.017913818359375, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 70250999.0, "reward": 1.0758928060531616, "reward_std": 0.12565498054027557, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 329.546875, "completions/mean_terminated_length": 329.546875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.66838576585869, "grad_norm": 0.2092115879058838, "kl": 0.018951416015625, "learning_rate": 1e-06, "loss": 0.0238, "num_tokens": 70667527.0, "reward": 1.0669643878936768, "reward_std": 0.12470243126153946, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0758928582072258, "rewards/curriculum_aware_reward_fn/std": 0.265122652053833, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1247.0, "completions/max_terminated_length": 1247.0, "completions/mean_length": 332.4888610839844, "completions/mean_terminated_length": 332.4888610839844, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.6725116039195461, "grad_norm": 0.2524375021457672, "kl": 0.02923583984375, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 71083322.0, "reward": 1.0803571939468384, "reward_std": 0.16666661202907562, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.0915178582072258, "rewards/curriculum_aware_reward_fn/std": 0.2886664867401123, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 322.6808166503906, "completions/mean_terminated_length": 322.6808166503906, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.6766374419804023, "grad_norm": 0.20356769859790802, "kl": 0.01926422119140625, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 71498482.0, "reward": 1.0602679252624512, "reward_std": 0.10799860954284668, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.0758928582072258, "rewards/curriculum_aware_reward_fn/std": 0.265122652053833, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 332.8125, "completions/mean_terminated_length": 332.8125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.6807632800412584, "grad_norm": 0.2344810515642166, "kl": 0.030609130859375, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 71927295.0, "reward": 1.0736607313156128, "reward_std": 0.17476198077201843, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.0915178582072258, "rewards/curriculum_aware_reward_fn/std": 0.2886664867401123, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 325.7857360839844, "completions/mean_terminated_length": 325.7857360839844, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.6848891181021145, "grad_norm": 0.20018766820430756, "kl": 0.017852783203125, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 72358462.0, "reward": 1.0959821939468384, "reward_std": 0.1302640587091446, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774, "rewards/curriculum_aware_reward_fn/std": 0.30387789011001587, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 298.6317138671875, "completions/mean_terminated_length": 298.6317138671875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6890149561629706, "grad_norm": 0.2147868126630783, "kl": 0.0235443115234375, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 72758627.0, "reward": 1.1294643878936768, "reward_std": 0.1563330590724945, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1361607164144516, "rewards/curriculum_aware_reward_fn/std": 0.3497976064682007, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 358.42413330078125, "completions/mean_terminated_length": 350.0626525878906, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.6931407942238267, "grad_norm": 0.1920340359210968, "kl": 0.0211029052734375, "learning_rate": 1e-06, "loss": 0.0262, "num_tokens": 73209869.0, "reward": 1.0066964626312256, "reward_std": 0.11701971292495728, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.03125, "rewards/curriculum_aware_reward_fn/std": 0.17418713867664337, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 318.828125, "completions/mean_terminated_length": 318.828125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.6972666322846828, "grad_norm": 0.2298915535211563, "kl": 0.027587890625, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 73638157.0, "reward": 1.0959821939468384, "reward_std": 0.15831470489501953, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.109375, "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 317.1696472167969, "completions/mean_terminated_length": 317.1696472167969, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.701392470345539, "grad_norm": 0.21549735963344574, "kl": 0.0198516845703125, "learning_rate": 1e-06, "loss": -0.0046, "num_tokens": 74072443.0, "reward": 1.09375, "reward_std": 0.11881917715072632, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774, "rewards/curriculum_aware_reward_fn/std": 0.30387791991233826, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 336.6785888671875, "completions/mean_terminated_length": 336.6785888671875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.7055183084063951, "grad_norm": 0.20629988610744476, "kl": 0.025146484375, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 74497094.0, "reward": 1.078125, "reward_std": 0.1222337856888771, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.09375, "rewards/curriculum_aware_reward_fn/std": 0.2918064594268799, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 314.9933166503906, "completions/mean_terminated_length": 314.9933166503906, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7096441464672512, "grad_norm": 0.2502371072769165, "kl": 0.021575927734375, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 74898653.0, "reward": 1.0959821939468384, "reward_std": 0.1645059585571289, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1071428582072258, "rewards/curriculum_aware_reward_fn/std": 0.3096405565738678, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 309.25, "completions/mean_terminated_length": 309.25, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.7137699845281072, "grad_norm": 0.23979751765727997, "kl": 0.02185821533203125, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 75294118.0, "reward": 1.1584821939468384, "reward_std": 0.1748540848493576, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 300.0133972167969, "completions/mean_terminated_length": 300.0133972167969, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.7178958225889633, "grad_norm": 0.2390657663345337, "kl": 0.0233306884765625, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 75695091.0, "reward": 1.1116071939468384, "reward_std": 0.14733438193798065, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1160714253783226, "rewards/curriculum_aware_reward_fn/std": 0.3275708258152008, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 309.01116943359375, "completions/mean_terminated_length": 309.01116943359375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7220216606498195, "grad_norm": 0.20473162829875946, "kl": 0.0201416015625, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 76132092.0, "reward": 1.1116071939468384, "reward_std": 0.14241951704025269, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.1116071417927742, "rewards/curriculum_aware_reward_fn/std": 0.315234512090683, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3040.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 329.359375, "completions/mean_terminated_length": 329.359375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.7261474987106756, "grad_norm": 0.20908260345458984, "kl": 0.02310943603515625, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 76548522.0, "reward": 1.1116071939468384, "reward_std": 0.1505260318517685, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 324.7098388671875, "completions/mean_terminated_length": 324.7098388671875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.7302733367715317, "grad_norm": 0.2211509495973587, "kl": 0.02474212646484375, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 76962372.0, "reward": 1.0736607313156128, "reward_std": 0.13854160904884338, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.0892857164144516, "rewards/curriculum_aware_reward_fn/std": 0.2854744791984558, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 318.7544860839844, "completions/mean_terminated_length": 318.7544860839844, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7343991748323878, "grad_norm": 0.19926562905311584, "kl": 0.0231475830078125, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 77385566.0, "reward": 1.09375, "reward_std": 0.1347728669643402, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1116071417927742, "rewards/curriculum_aware_reward_fn/std": 0.31523454189300537, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1221.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 317.44866943359375, "completions/mean_terminated_length": 317.44866943359375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.7385250128932439, "grad_norm": 0.23011335730552673, "kl": 0.02362823486328125, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 77776864.0, "reward": 1.1004464626312256, "reward_std": 0.15760944783687592, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1160714253783226, "rewards/curriculum_aware_reward_fn/std": 0.32066863775253296, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1092.0, "completions/max_terminated_length": 1092.0, "completions/mean_length": 348.8951110839844, "completions/mean_terminated_length": 348.8951110839844, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.7426508509541001, "grad_norm": 0.2574416995048523, "kl": 0.02266693115234375, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 78192855.0, "reward": 1.0513393878936768, "reward_std": 0.16110500693321228, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.0669642835855484, "rewards/curriculum_aware_reward_fn/std": 0.2502395808696747, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 351.3058166503906, "completions/mean_terminated_length": 351.3058166503906, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.7467766890149562, "grad_norm": 0.2253522127866745, "kl": 0.01789093017578125, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 78631794.0, "reward": 1.1227679252624512, "reward_std": 0.1853947788476944, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1339285671710968, "rewards/curriculum_aware_reward_fn/std": 0.34095630049705505, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 332.0357360839844, "completions/mean_terminated_length": 332.0357360839844, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.7509025270758123, "grad_norm": 0.20990048348903656, "kl": 0.031982421875, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 79035710.0, "reward": 1.0580357313156128, "reward_std": 0.12470243871212006, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.0736607164144516, "rewards/curriculum_aware_reward_fn/std": 0.2615099549293518, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 307.3258972167969, "completions/mean_terminated_length": 307.3258972167969, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.7550283651366684, "grad_norm": 0.20495188236236572, "kl": 0.018707275390625, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 79432819.0, "reward": 1.102678656578064, "reward_std": 0.13131339848041534, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.109375, "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 335.05804443359375, "completions/mean_terminated_length": 335.05804443359375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.7591542031975245, "grad_norm": 0.20847569406032562, "kl": 0.0222625732421875, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 79867608.0, "reward": 1.0959821939468384, "reward_std": 0.13643288612365723, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.109375, "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1293.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 337.1607360839844, "completions/mean_terminated_length": 337.1607360839844, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.7632800412583806, "grad_norm": 0.20025186240673065, "kl": 0.02020263671875, "learning_rate": 1e-06, "loss": 0.03, "num_tokens": 80288485.0, "reward": 1.0982143878936768, "reward_std": 0.14409975707530975, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1049107164144516, "rewards/curriculum_aware_reward_fn/std": 0.3067809045314789, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 316.796875, "completions/mean_terminated_length": 316.796875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.7674058793192368, "grad_norm": 0.1984976828098297, "kl": 0.01725006103515625, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 80690432.0, "reward": 1.171875, "reward_std": 0.14828035235404968, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.171875, "rewards/curriculum_aware_reward_fn/std": 0.38935965299606323, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 332.6183166503906, "completions/mean_terminated_length": 332.6183166503906, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.7715317173800929, "grad_norm": 0.16310814023017883, "kl": 0.0291900634765625, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 81107006.0, "reward": 1.0535714626312256, "reward_std": 0.089268259704113, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.0558035708963871, "rewards/curriculum_aware_reward_fn/std": 0.22979861497879028, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 325.65625, "completions/mean_terminated_length": 325.65625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.7756575554409489, "grad_norm": 0.23649069666862488, "kl": 0.02215576171875, "learning_rate": 1e-06, "loss": -0.0056, "num_tokens": 81515729.0, "reward": 1.0892858505249023, "reward_std": 0.14969965815544128, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.09375, "rewards/curriculum_aware_reward_fn/std": 0.2918064594268799, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 324.5201110839844, "completions/mean_terminated_length": 324.5201110839844, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.779783393501805, "grad_norm": 0.23050464689731598, "kl": 0.02477264404296875, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 81919204.0, "reward": 1.087053656578064, "reward_std": 0.1559111475944519, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0959821417927742, "rewards/curriculum_aware_reward_fn/std": 0.29489606618881226, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 314.15179443359375, "completions/mean_terminated_length": 314.15179443359375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.7839092315626611, "grad_norm": 0.19827330112457275, "kl": 0.02080535888671875, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 82312679.0, "reward": 1.1116071939468384, "reward_std": 0.12813948094844818, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1138392835855484, "rewards/curriculum_aware_reward_fn/std": 0.3249305486679077, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 331.8326110839844, "completions/mean_terminated_length": 331.8326110839844, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.7880350696235173, "grad_norm": 0.22612139582633972, "kl": 0.0218505859375, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 82710470.0, "reward": 1.1607143878936768, "reward_std": 0.15348076820373535, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 349.28350830078125, "completions/mean_terminated_length": 349.28350830078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.7921609076843734, "grad_norm": 0.17097440361976624, "kl": 0.02294921875, "learning_rate": 1e-06, "loss": 0.0131, "num_tokens": 83159962.0, "reward": 1.0691964626312256, "reward_std": 0.09517396241426468, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0736607164144516, "rewards/curriculum_aware_reward_fn/std": 0.2615099549293518, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1317.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 334.3571472167969, "completions/mean_terminated_length": 334.3571472167969, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.7962867457452295, "grad_norm": 0.19761425256729126, "kl": 0.0179901123046875, "learning_rate": 1e-06, "loss": 0.006, "num_tokens": 83583139.0, "reward": 1.118303656578064, "reward_std": 0.11141322553157806, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 346.2544860839844, "completions/mean_terminated_length": 346.2544860839844, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8004125838060856, "grad_norm": 0.17466260492801666, "kl": 0.0179595947265625, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 84002651.0, "reward": 1.1361607313156128, "reward_std": 0.11848875135183334, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.140625, "rewards/curriculum_aware_reward_fn/std": 0.3480229377746582, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 339.45538330078125, "completions/mean_terminated_length": 339.45538330078125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.8045384218669417, "grad_norm": 0.2132083773612976, "kl": 0.02034759521484375, "learning_rate": 1e-06, "loss": 0.0076, "num_tokens": 84413510.0, "reward": 1.1227679252624512, "reward_std": 0.16031090915203094, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1127.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 338.56475830078125, "completions/mean_terminated_length": 338.56475830078125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8086642599277978, "grad_norm": 0.23344184458255768, "kl": 0.0220794677734375, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 84836675.0, "reward": 1.118303656578064, "reward_std": 0.16146619617938995, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 357.12725830078125, "completions/mean_terminated_length": 357.12725830078125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.812790097988654, "grad_norm": 0.1769321709871292, "kl": 0.0223846435546875, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 85263697.0, "reward": 1.0535714626312256, "reward_std": 0.10079409927129745, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0625, "rewards/curriculum_aware_reward_fn/std": 0.24233205616474152, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1185.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 331.84600830078125, "completions/mean_terminated_length": 331.84600830078125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.8169159360495101, "grad_norm": 0.19233635067939758, "kl": 0.020965576171875, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 85666233.0, "reward": 1.1294643878936768, "reward_std": 0.11654725670814514, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1361607164144516, "rewards/curriculum_aware_reward_fn/std": 0.34334251284599304, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 319.0848388671875, "completions/mean_terminated_length": 319.0848388671875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.8210417741103662, "grad_norm": 0.20209157466888428, "kl": 0.0204010009765625, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 86068323.0, "reward": 1.1540179252624512, "reward_std": 0.13614733517169952, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.15625, "rewards/curriculum_aware_reward_fn/std": 0.36349809169769287, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1064.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 333.359375, "completions/mean_terminated_length": 333.359375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.8251676121712223, "grad_norm": 0.19651298224925995, "kl": 0.0203094482421875, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 86499628.0, "reward": 1.087053656578064, "reward_std": 0.12051526457071304, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0915178582072258, "rewards/curriculum_aware_reward_fn/std": 0.2886664867401123, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 327.5602722167969, "completions/mean_terminated_length": 319.1297607421875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.8292934502320783, "grad_norm": 0.18175944685935974, "kl": 0.0191650390625, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 86915077.0, "reward": 1.109375, "reward_std": 0.109377421438694, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1183035746216774, "rewards/curriculum_aware_reward_fn/std": 0.32332828640937805, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 329.15625, "completions/mean_terminated_length": 329.15625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.8334192882929345, "grad_norm": 0.20381583273410797, "kl": 0.02025604248046875, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 87321756.0, "reward": 1.0647321939468384, "reward_std": 0.13746020197868347, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774, "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1313.0, "completions/max_terminated_length": 1313.0, "completions/mean_length": 347.59600830078125, "completions/mean_terminated_length": 347.59600830078125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.8375451263537906, "grad_norm": 0.19007371366024017, "kl": 0.0234375, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 87759481.0, "reward": 1.040178656578064, "reward_std": 0.08854056894779205, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0491071417927742, "rewards/curriculum_aware_reward_fn/std": 0.2163332849740982, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1044.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 310.5714416503906, "completions/mean_terminated_length": 310.5714416503906, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.8416709644146467, "grad_norm": 0.20127519965171814, "kl": 0.0208587646484375, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 88163448.0, "reward": 1.0892857313156128, "reward_std": 0.10625765472650528, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.09375, "rewards/curriculum_aware_reward_fn/std": 0.2918064594268799, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 314.85491943359375, "completions/mean_terminated_length": 314.85491943359375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8457968024755028, "grad_norm": 0.2518065869808197, "kl": 0.0211944580078125, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 88569285.0, "reward": 1.133928656578064, "reward_std": 0.21362663805484772, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548, "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 324.6383972167969, "completions/mean_terminated_length": 324.6383972167969, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8499226405363589, "grad_norm": 0.1998155266046524, "kl": 0.0198211669921875, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 88975837.0, "reward": 1.0892857313156128, "reward_std": 0.11600644886493683, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.0959821417927742, "rewards/curriculum_aware_reward_fn/std": 0.29489603638648987, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 311.2098388671875, "completions/mean_terminated_length": 311.2098388671875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8540484785972151, "grad_norm": 0.2548258602619171, "kl": 0.022491455078125, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 89371977.0, "reward": 1.1227679252624512, "reward_std": 0.18092425167560577, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1383928507566452, "rewards/curriculum_aware_reward_fn/std": 0.34569787979125977, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 333.171875, "completions/mean_terminated_length": 333.171875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.8581743166580712, "grad_norm": 0.21088841557502747, "kl": 0.0226287841796875, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 89782450.0, "reward": 1.0825893878936768, "reward_std": 0.13442879915237427, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0915178582072258, "rewards/curriculum_aware_reward_fn/std": 0.2886664867401123, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 334.8973388671875, "completions/mean_terminated_length": 326.48321533203125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.8623001547189273, "grad_norm": 0.20233199000358582, "kl": 0.0198974609375, "learning_rate": 1e-06, "loss": 0.0205, "num_tokens": 90211364.0, "reward": 1.1205357313156128, "reward_std": 0.1518603265285492, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1294642835855484, "rewards/curriculum_aware_reward_fn/std": 0.3360883891582489, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 327.75, "completions/mean_terminated_length": 327.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.8664259927797834, "grad_norm": 0.22493985295295715, "kl": 0.0274200439453125, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 90624889.0, "reward": 1.1294643878936768, "reward_std": 0.1635512113571167, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.140625, "rewards/curriculum_aware_reward_fn/std": 0.3480229377746582, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1149.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 312.17413330078125, "completions/mean_terminated_length": 312.17413330078125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.8705518308406395, "grad_norm": 0.25043389201164246, "kl": 0.022552490234375, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 91019473.0, "reward": 1.0848214626312256, "reward_std": 0.17821677029132843, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.0982142835855484, "rewards/curriculum_aware_reward_fn/std": 0.31259387731552124, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 331.09375, "completions/mean_terminated_length": 331.09375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8746776689014956, "grad_norm": 0.224375382065773, "kl": 0.0216522216796875, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 91431318.0, "reward": 1.1116071939468384, "reward_std": 0.15868249535560608, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1204.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 338.93975830078125, "completions/mean_terminated_length": 338.93975830078125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.8788035069623518, "grad_norm": 0.21637342870235443, "kl": 0.02056884765625, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 91855064.0, "reward": 1.09375, "reward_std": 0.15141817927360535, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.109375, "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1318.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 343.9464416503906, "completions/mean_terminated_length": 343.9464416503906, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.8829293450232079, "grad_norm": 0.24078968167304993, "kl": 0.01845550537109375, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 92289257.0, "reward": 1.1004464626312256, "reward_std": 0.19930830597877502, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1138392835855484, "rewards/curriculum_aware_reward_fn/std": 0.31797105073928833, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1147.0, "completions/max_terminated_length": 1147.0, "completions/mean_length": 332.5379638671875, "completions/mean_terminated_length": 332.5379638671875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.887055183084064, "grad_norm": 0.21402831375598907, "kl": 0.027587890625, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 92697416.0, "reward": 1.102678656578064, "reward_std": 0.1518242359161377, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1138392835855484, "rewards/curriculum_aware_reward_fn/std": 0.31797102093696594, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 350.8794860839844, "completions/mean_terminated_length": 350.8794860839844, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.89118102114492, "grad_norm": 0.2337924987077713, "kl": 0.023590087890625, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 93123774.0, "reward": 1.0892857313156128, "reward_std": 0.17216628789901733, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.1116071417927742, "rewards/curriculum_aware_reward_fn/std": 0.315234512090683, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 330.06475830078125, "completions/mean_terminated_length": 330.06475830078125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.8953068592057761, "grad_norm": 0.22798487544059753, "kl": 0.021392822265625, "learning_rate": 1e-06, "loss": 0.0201, "num_tokens": 93533765.0, "reward": 1.0982143878936768, "reward_std": 0.16151104867458344, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774, "rewards/curriculum_aware_reward_fn/std": 0.30387791991233826, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 314.48663330078125, "completions/mean_terminated_length": 314.48663330078125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.8994326972666323, "grad_norm": 0.21752700209617615, "kl": 0.0204010009765625, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 93932052.0, "reward": 1.1361607313156128, "reward_std": 0.1445801705121994, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1383928507566452, "rewards/curriculum_aware_reward_fn/std": 0.35210978984832764, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1153.0, "completions/max_terminated_length": 1153.0, "completions/mean_length": 331.5669860839844, "completions/mean_terminated_length": 331.5669860839844, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.9035585353274884, "grad_norm": 0.17947368323802948, "kl": 0.0288543701171875, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 94350941.0, "reward": 1.078125, "reward_std": 0.08682204782962799, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.0892857164144516, "rewards/curriculum_aware_reward_fn/std": 0.2854744791984558, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 322.22991943359375, "completions/mean_terminated_length": 322.22991943359375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.9076843733883445, "grad_norm": 0.2510084807872772, "kl": 0.027130126953125, "learning_rate": 1e-06, "loss": 0.018, "num_tokens": 94749953.0, "reward": 1.1316965818405151, "reward_std": 0.19105976819992065, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, "rewards/curriculum_aware_reward_fn/std": 0.36136940121650696, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 334.0870666503906, "completions/mean_terminated_length": 334.0870666503906, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.9118102114492006, "grad_norm": 0.24003419280052185, "kl": 0.0258636474609375, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 95151344.0, "reward": 1.109375, "reward_std": 0.16909699141979218, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 326.65850830078125, "completions/mean_terminated_length": 326.65850830078125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.9159360495100567, "grad_norm": 0.20545540750026703, "kl": 0.0222930908203125, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 95561167.0, "reward": 1.1160714626312256, "reward_std": 0.1285231113433838, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 345.76116943359375, "completions/mean_terminated_length": 345.76116943359375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.9200618875709129, "grad_norm": 0.21364757418632507, "kl": 0.01959228515625, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 95981263.0, "reward": 1.0892858505249023, "reward_std": 0.1504049301147461, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.09375, "rewards/curriculum_aware_reward_fn/std": 0.3210110366344452, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 348.83929443359375, "completions/mean_terminated_length": 348.83929443359375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.924187725631769, "grad_norm": 0.22457292675971985, "kl": 0.02020263671875, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 96415429.0, "reward": 1.095982313156128, "reward_std": 0.14929361641407013, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774, "rewards/curriculum_aware_reward_fn/std": 0.30387791991233826, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 318.4196472167969, "completions/mean_terminated_length": 318.4196472167969, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.9283135636926251, "grad_norm": 0.21325407922267914, "kl": 0.0233306884765625, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 96826699.0, "reward": 1.1205357313156128, "reward_std": 0.1431247889995575, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 324.5446472167969, "completions/mean_terminated_length": 324.5446472167969, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.9324394017534812, "grad_norm": 0.18811924755573273, "kl": 0.02105712890625, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 97235680.0, "reward": 1.1049107313156128, "reward_std": 0.11635051667690277, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1071428582072258, "rewards/curriculum_aware_reward_fn/std": 0.3096405565738678, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 321.546875, "completions/mean_terminated_length": 321.546875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.9365652398143373, "grad_norm": 0.2765014171600342, "kl": 0.042205810546875, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 97642366.0, "reward": 1.1272321939468384, "reward_std": 0.17475977540016174, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, "rewards/curriculum_aware_reward_fn/std": 0.352584570646286, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 327.7701110839844, "completions/mean_terminated_length": 327.7701110839844, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.9406910778751933, "grad_norm": 0.20436213910579681, "kl": 0.021942138671875, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 98054290.0, "reward": 1.095982313156128, "reward_std": 0.1473296880722046, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1071428582072258, "rewards/curriculum_aware_reward_fn/std": 0.3096405565738678, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 347.2076110839844, "completions/mean_terminated_length": 347.2076110839844, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9448169159360496, "grad_norm": 0.1630670577287674, "kl": 0.0210113525390625, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 98490938.0, "reward": 1.087053656578064, "reward_std": 0.11560040712356567, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0915178582072258, "rewards/curriculum_aware_reward_fn/std": 0.2886664867401123, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1187.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 332.33929443359375, "completions/mean_terminated_length": 332.33929443359375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.9489427539969056, "grad_norm": 0.20399163663387299, "kl": 0.0230560302734375, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 98909228.0, "reward": 1.1071429252624512, "reward_std": 0.13823235034942627, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1160714253783226, "rewards/curriculum_aware_reward_fn/std": 0.32066863775253296, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 313.04913330078125, "completions/mean_terminated_length": 313.04913330078125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.9530685920577617, "grad_norm": 0.26219725608825684, "kl": 0.0257720947265625, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 99311106.0, "reward": 1.1428571939468384, "reward_std": 0.19518183171749115, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968, "rewards/curriculum_aware_reward_fn/std": 0.3570319712162018, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1106.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 304.67413330078125, "completions/mean_terminated_length": 304.67413330078125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.9571944301186178, "grad_norm": 0.2229895293712616, "kl": 0.023284912109375, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 99714307.0, "reward": 1.1227679252624512, "reward_std": 0.1438300609588623, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 324.36163330078125, "completions/mean_terminated_length": 324.36163330078125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.9613202681794739, "grad_norm": 0.24561771750450134, "kl": 0.0222320556640625, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 100127420.0, "reward": 1.125, "reward_std": 0.15938647091388702, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 319.1339416503906, "completions/mean_terminated_length": 319.1339416503906, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.9654461062403301, "grad_norm": 0.20435009896755219, "kl": 0.026947021484375, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 100542349.0, "reward": 1.1049108505249023, "reward_std": 0.13719666004180908, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.109375, "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 342.3951110839844, "completions/mean_terminated_length": 342.3951110839844, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.9695719443011862, "grad_norm": 0.22168368101119995, "kl": 0.027984619140625, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 100966712.0, "reward": 1.1517857313156128, "reward_std": 0.16110500693321228, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, "rewards/curriculum_aware_reward_fn/std": 0.3836035132408142, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 310.33929443359375, "completions/mean_terminated_length": 310.33929443359375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.9736977823620423, "grad_norm": 0.24561944603919983, "kl": 0.024139404296875, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 101380200.0, "reward": 1.071428656578064, "reward_std": 0.1601366102695465, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0803571417927742, "rewards/curriculum_aware_reward_fn/std": 0.2721492052078247, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 316.3817138671875, "completions/mean_terminated_length": 316.3817138671875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.9778236204228984, "grad_norm": 0.21199537813663483, "kl": 0.02339935302734375, "learning_rate": 1e-06, "loss": 0.0179, "num_tokens": 101778223.0, "reward": 1.09375, "reward_std": 0.14510643482208252, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0982142835855484, "rewards/curriculum_aware_reward_fn/std": 0.29793688654899597, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 329.2410888671875, "completions/mean_terminated_length": 320.8143310546875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.9819494584837545, "grad_norm": 0.2752473056316376, "kl": 0.0230255126953125, "learning_rate": 1e-06, "loss": 0.0188, "num_tokens": 102201677.0, "reward": 1.140625238418579, "reward_std": 0.21461744606494904, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1206.0, "completions/max_terminated_length": 1206.0, "completions/mean_length": 330.46429443359375, "completions/mean_terminated_length": 330.46429443359375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.9860752965446106, "grad_norm": 0.21098829805850983, "kl": 0.025848388671875, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 102616978.0, "reward": 1.1004464626312256, "reward_std": 0.10864535719156265, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1116071417927742, "rewards/curriculum_aware_reward_fn/std": 0.31523454189300537, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 348.0133972167969, "completions/mean_terminated_length": 348.0133972167969, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.9902011346054668, "grad_norm": 0.21229997277259827, "kl": 0.0264892578125, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 103045643.0, "reward": 1.0625, "reward_std": 0.12955878674983978, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.078125, "rewards/curriculum_aware_reward_fn/std": 0.26866820454597473, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 320.203125, "completions/mean_terminated_length": 320.203125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.9943269726663229, "grad_norm": 0.2476748675107956, "kl": 0.019500732421875, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 103465384.0, "reward": 1.1205358505249023, "reward_std": 0.18537454307079315, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 328.1678771972656, "completions/mean_terminated_length": 328.1678771972656, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.998452810727179, "grad_norm": 0.16760054230690002, "kl": 0.0231475830078125, "learning_rate": 1e-06, "loss": 0.0068, "num_tokens": 103885406.0, "reward": 1.055803656578064, "reward_std": 0.09345544129610062, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.0647321417927742, "rewards/curriculum_aware_reward_fn/std": 0.24632768332958221, "step": 242 }, { "epoch": 0.998452810727179, "step": 242, "total_flos": 0.0, "train_loss": 0.01242125062383798, "train_runtime": 20692.6662, "train_samples_per_second": 0.749, "train_steps_per_second": 0.012 } ], "logging_steps": 1, "max_steps": 242, "num_input_tokens_seen": 103885406, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }